knitr::opts_chunk$set(cache = TRUE, dev = c("png", "pdf"))
library(here)
source(here("src", "model_linear_grampians.R"))
source(here("src", "environmental_distance_metrics.R"))
library(bossMaps)
library(dplyr)
library(ggplot2)
library(gridExtra)
#library(rstanarm)
library(tidyr)

The following is a validation of the predictions/inference from a trait-environment model of the Grampians Bioregion ~20 taxa dataset.

The following steps are undertaken:

mds <- local({
  nms1 <-   c(
    "mean_moisture_index_of_low_qtr", "topographic_wetness_index_3sec",
    "relief_1000m_radius", "total_nitrogen"
  )
  nms2 <- c("sla_mm2_per_mg", "seed_mass_mg", "max_height_m")
  vars <- lapply(nms1, function(x) log(grampians[[x]]))
  vars <- do.call(cbind, vars)
  vars <- cbind(
    vars, do.call(cbind, lapply(nms2, function(x) log(grampians[[x]])))
  )
  vars <- scale(vars)
  cntrs <- attr(vars, "scaled:center")
  scales <- attr(vars, "scaled:scale")

  vars_new_raw <- na.omit(southeast)
  vars_new <- lapply(seq_along(nms1), function(x) log(vars_new_raw[[nms1[x]]]))
  vars_new <- do.call(cbind, vars_new)
  vars_new <- cbind(
    vars_new, do.call(cbind, lapply(nms2, function(x) log(vars_new_raw[[x]])))
  )
  vars_new <- scale(vars_new, center = cntrs, scale = scales)

  setNames(
    data.frame(
      vars_new_raw[, c("occupancy", "taxon", "ibra_subregion")], vars_new
    ),
    c(
      "y", "taxon", "ibra_subregion", "mlq", "twi", "r1k", "tn", "sla", "sm",
      "mh"
    )
  )
})
perf <- local({
  perf <- mapply(
    function(f, re.form) {
      f(
        model_grampians, y, list(taxon, ibra_subregion),
        rbind(mds, mdg[names(mds)]), re.form
      )
    },
    f = rep(
      list(auroc, pred_inf, dev_explained, auprc, auprc_rel, auprc_rand),
      each = 2
    ),
    re.form = list(NULL,  NA)
  )
  taxon <- sub("(.*)\\..*", "\\1", rownames(perf))
  ibra_subregion <- sub(".*\\.", "", rownames(perf))
  dim(perf) <- dim(perf) * c(2, .5)
  colnames(perf) <- c(
    "AUC", "mean_predictive_info", "deviance_explained", "AUC-PR", "Rel-AUC-PR",
    "Rand-AUC-PR"
  )
  cbind(
    taxon = rep(taxon, 2),
    ibra_subregion = rep(ibra_subregion, 2),
    random_effect = rep(c(TRUE, FALSE), each = nrow(perf) / 2),
    as.data.frame(perf)
  )
})
taxon_region_prevalence <- summarise(
  group_by(rbind(mds, mdg[names(mds)]), taxon, ibra_subregion),
  prevalence = mean(y))

aucdist <- data.frame(
  perf[c("taxon", "ibra_subregion", "AUC", "AUC-PR")],
  `Geographic (km)`= dists[as.character(perf$ibra_subregion)],
  Environmental = kldists[as.character(perf$ibra_subregion)],
  Community = community_dist[as.character(perf$ibra_subregion)],
  "Dummy" = "Dummy",
  check.names = FALSE
)

aucdist <- aucdist[!perf$random_effect, ]

# Pete: "This retains two values for each model predciton, for random_effect =
#        TRUE or FALSE."
# Will: "Yes, effectively making predictions with knowledge of species ID or
#        with trait information only."

aucdist <- gather(
  aucdist, "distance", "value", `Geographic (km)`, Environmental, Community
)

aucdist <- left_join(aucdist, taxon_region_prevalence)

aucdistmean <- summarise(
  group_by(aucdist, ibra_subregion, distance),
  medauc = median(AUC),
  medauprc = median(`AUC-PR` / prevalence),
  value = mean(value)
)

aucdistplot <- ggplot(aucdist) +
  aes(AUC, value) +
  geom_vline(xintercept = .5, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauc), data = aucdistmean, size = 2, shape = 21, fill = "white"
  ) +
  xlim(.4, 1) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    strip.background = element_blank()
  )

aucbw <- ggplot(aucdist) +
  aes(x = Dummy, y = AUC) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  ylim(.4, 1) +
  scale_x_discrete(labels = "0.95") +
  xlab("Dummy") +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

aucplot_gg <- ggplot(filter(aucdist, ibra_subregion == "Greater Grampians")) +
  aes(x = AUC, y = 1) +
  geom_vline(xintercept = .5, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauc),
    data = slice(filter(aucdistmean, ibra_subregion == "Greater Grampians"), 1L),
    size = 2, shape = 21, fill = "white"
  ) +
  xlim(.4, 1) +
  ylab("Distance") +
  xlab("AUROC") +
  facet_grid(. ~ Dummy) +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

auprcdistplot <- ggplot(aucdist) +
  aes(`AUC-PR` / prevalence, value) +
  scale_x_continuous(
    trans = "log2", breaks = 2^seq(-1, 8, 2), labels = 2^seq(-1, 8, 2),
    limits = c(.5, 170)
  ) +
  geom_vline(xintercept = 1, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauprc), data = aucdistmean, size = 2, shape = 21, fill = "white"
  ) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  )

auprcbw <- ggplot(aucdist) +
  aes(x = Dummy, y = `AUC-PR` / prevalence) +
  scale_y_continuous(trans = "log2", limits = c(.5, 170)) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  scale_x_discrete(labels = "0.95") +
  xlab("Dummy") +
  theme_bw() +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

auprcplot_gg <- ggplot(filter(aucdist, ibra_subregion == "Greater Grampians")) +
  aes(x = `AUC-PR` / prevalence, y = 1) +
  scale_x_continuous(
    trans = "log2", breaks = 2^seq(-1, 8, 2), labels = 2^seq(-1, 8, 2),
    limits = c(.5, 170)
  ) +
  geom_vline(xintercept = 1, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauprc),
    data =
      slice(filter(aucdistmean, ibra_subregion == "Greater Grampians"), 1L),
    size = 2, shape = 21, fill = "white"
  ) +
  ylab("Distance") +
  xlab("AUPRC / Prevalence") +
  facet_grid(. ~ Dummy) +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

grid.arrange(
  aucplot_gg, aucbw, aucdistplot, auprcplot_gg, auprcbw, auprcdistplot,
  nrow = 2, widths = c(.12, .08, .8), heights = c(.49, .51)
)

Boxplot statistics

boxplot.stats(aucdist$AUC)
$stats
[1] 0.4215136 0.5685906 0.6539451 0.7735350 0.9946237

$n
[1] 1740

$conf
[1] 0.6461823 0.6617079

$out
numeric(0)
boxplot.stats(aucdist$`AUC-PR` / aucdist$prevalence)
$stats
[1] 0.5029651 0.8867251 1.2357457 2.1772197 3.9946279

$n
[1] 1740

$conf
[1] 1.186865 1.284627

$out
  [1]   6.264201  11.029269   5.019779   6.194359   4.447007  17.060637
  [7] 166.550516  10.339579   4.468890   5.349636   4.286554   4.177586
 [13]   4.590621   5.202100   5.562653   4.394573   8.573941   6.993234
 [19]   6.819060   5.304599   6.190532  12.412588  48.896500   4.648834
 [25]  12.633072   6.219755   6.904140  57.381477   7.504053   6.509198
 [31]  43.937150  12.936753   5.292922  40.670476   9.529626   6.535023
 [37]   8.369079   4.933171  61.103639   4.365564   4.818119   6.961859
 [43]  10.586430   4.843684  26.497484   5.904673  13.011292   5.033057
 [49]   8.044636   5.558887   4.540611  39.155428   5.168444   6.487082
 [55]  28.285509  13.719953   8.633631  12.569967  53.620160   7.464814
 [61]   5.578885  16.645197   4.539642   6.264201  11.029269   5.019779
 [67]   6.194359   4.447007  17.060637 166.550516  10.339579   4.468890
 [73]   5.349636   4.286554   4.177586   4.590621   5.202100   5.562653
 [79]   4.394573   8.573941   6.993234   6.819060   5.304599   6.190532
 [85]  12.412588  48.896500   4.648834  12.633072   6.219755   6.904140
 [91]  57.381477   7.504053   6.509198  43.937150  12.936753   5.292922
 [97]  40.670476   9.529626   6.535023   8.369079   4.933171  61.103639
[103]   4.365564   4.818119   6.961859  10.586430   4.843684  26.497484
[109]   5.904673  13.011292   5.033057   8.044636   5.558887   4.540611
[115]  39.155428   5.168444   6.487082  28.285509  13.719953   8.633631
[121]  12.569967  53.620160   7.464814   5.578885  16.645197   4.539642
[127]   6.264201  11.029269   5.019779   6.194359   4.447007  17.060637
[133] 166.550516  10.339579   4.468890   5.349636   4.286554   4.177586
[139]   4.590621   5.202100   5.562653   4.394573   8.573941   6.993234
[145]   6.819060   5.304599   6.190532  12.412588  48.896500   4.648834
[151]  12.633072   6.219755   6.904140  57.381477   7.504053   6.509198
[157]  43.937150  12.936753   5.292922  40.670476   9.529626   6.535023
[163]   8.369079   4.933171  61.103639   4.365564   4.818119   6.961859
[169]  10.586430   4.843684  26.497484   5.904673  13.011292   5.033057
[175]   8.044636   5.558887   4.540611  39.155428   5.168444   6.487082
[181]  28.285509  13.719953   8.633631  12.569967  53.620160   7.464814
[187]   5.578885  16.645197   4.539642
aucdist_so <- filter(aucdist, taxon %in% intersect(mds$taxon, mdg$taxon))

aucdistmean_so <- summarise(
  group_by(aucdist_so, ibra_subregion, distance),
  medauc = median(AUC),
  medauprc = median(`AUC-PR` / prevalence),
  value = mean(value)
)

aucdistplot_so <- ggplot(aucdist_so) +
  aes(AUC, value) +
  geom_vline(xintercept = .5, color = "grey", size = 1.5) +
  geom_point(alpha = .1) +
  geom_point(
    aes(medauc), data = aucdistmean_so, size = 2, shape = 21, fill = "white"
  ) +
  xlim(.4, 1) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    strip.background = element_blank()
  )

aucbw_so <- ggplot(aucdist_so) +
  aes(x = Dummy, y = AUC) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  ylim(.4, 1) +
  scale_x_discrete(labels = "0.95") +
  xlab("Dummy") +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

aucplot_gg_so <- ggplot(filter(aucdist_so, ibra_subregion == "Greater Grampians")) +
  aes(x = AUC, y = 1) +
  geom_vline(xintercept = .5, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauc),
    data = slice(filter(aucdistmean_so, ibra_subregion == "Greater Grampians"), 1L),
    size = 2, shape = 21, fill = "white"
  ) +
  xlim(.4, 1) +
  ylab("Distance") +
  xlab("AUROC") +
  facet_grid(. ~ Dummy) +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

auprcdistplot_so <- ggplot(aucdist_so) +
  aes(`AUC-PR` / prevalence, value) +
  scale_x_continuous(
    trans = "log2", breaks = 2^seq(-1, 8, 2), labels = 2^seq(-1, 8, 2),
    limits = c(.5, 170)
  ) +
  geom_vline(xintercept = 1, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauprc), data = aucdistmean_so, size = 2, shape = 21, fill = "white"
  ) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  )

auprcbw_so <- ggplot(aucdist_so) +
  aes(x = Dummy, y = `AUC-PR` / prevalence) +
  scale_y_continuous(trans = "log2", limits = c(.5, 170)) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  scale_x_discrete(labels = "0.95") +
  xlab("Dummy") +
  theme_bw() +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

auprcplot_gg_so <- ggplot(filter(aucdist_so, ibra_subregion == "Greater Grampians")) +
  aes(x = `AUC-PR` / prevalence, y = 1) +
  scale_x_continuous(
    trans = "log2", breaks = 2^seq(-1, 8, 2), labels = 2^seq(-1, 8, 2),
    limits = c(.5, 170)
  ) +
  geom_vline(xintercept = 1, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauprc),
    data =
      slice(filter(aucdistmean_so, ibra_subregion == "Greater Grampians"), 1L),
    size = 2, shape = 21, fill = "white"
  ) +
  ylab("Distance") +
  xlab("AUPRC / Prevalence") +
  facet_grid(. ~ Dummy) +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

grid.arrange(
  aucplot_gg_so, aucbw_so, aucdistplot_so, auprcplot_gg_so, auprcbw_so,
  auprcdistplot_so,
  nrow = 2, widths = c(.12, .08, .8), heights = c(.49, .51)
)

devdist <- data.frame(
  perf[c("taxon", "ibra_subregion", "deviance_explained")],
  `Geographic (km)`= dists[as.character(perf$ibra_subregion)],
  Environmental = kldists[as.character(perf$ibra_subregion)],
  Community = community_dist[as.character(perf$ibra_subregion)],
  "Dummy" = "Dummy",
  check.names = FALSE
)

devdist <- gather(
  devdist, "distance", "value", `Geographic (km)`, Environmental, Community
)

devdist <- na.omit(devdist)

colnames(devdist)[3] <- "Deviance explained"

devdistmean <- summarise(
  group_by(devdist, ibra_subregion, distance),
  meandev = mean(`Deviance explained`), value = mean(value)
)

devdistplot <- ggplot(devdist) +
  aes(`Deviance explained`, value) +
  geom_vline(xintercept = 0, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(meandev), data = devdistmean, size = 2, shape = 21, fill = "white"
  ) +
  scale_x_continuous(trans = "asinh", limits = c(-56, 1)) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    strip.background = element_blank()
  )

devbw <- ggplot(devdist) +
  aes(x = Dummy, y = `Deviance explained`) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  scale_y_continuous(trans = "asinh", limits = c(-56, 1)) +
  scale_x_discrete(labels = "0.95") +
  xlab("Dummy") +
  theme_bw() +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

grid.arrange(devbw, devdistplot, nrow = 1, widths = c(.14, .86))

ggplot(
  transform(
    subset(perf, ibra_subregion != "Greater Grampians" & !random_effect),
    ibra_subregion = factor(
      ibra_subregion, names(sort(tapply(AUC, ibra_subregion, median)))
    )
  )
) +
aes(AUC) + geom_histogram() +
facet_wrap(
  ~ibra_subregion, 6,
  labeller = as_labeller(
    function(x) {
      x <- gsub("-", "- ", x)
      x <- strwrap(x, width = 13, simplify = FALSE)
      vapply(x, paste, character(1), collapse = "\n")
    }
  )
) +
xlab("AUC") +
ylab("No. of taxa") +
theme_bw() +
theme(axis.text.x = element_text(size = 8))

ggplot(
  transform(
    subset(
      left_join(perf, taxon_region_prevalence),
      ibra_subregion != "Greater Grampians" & !random_effect
    ),
    ibra_subregion = factor(
      ibra_subregion, names(sort(tapply(AUC, ibra_subregion, median)))
    )
  )
) +
aes(prevalence, AUC) +
geom_point(alpha = .5) +
facet_wrap(
  ~ibra_subregion, 6,
  labeller = as_labeller(
    function(x) {
      x <- gsub("-", "- ", x)
      x <- strwrap(x, width = 13, simplify = FALSE)
      vapply(x, paste, character(1), collapse = "\n")
    }
  )
) +
xlab("Prevalence") +
ylab("AUC") +
theme_bw() +
theme(axis.text.x = element_text(size = 8))

ggplot(
  transform(
    subset(perf, ibra_subregion != "Greater Grampians" & !random_effect),
    ibra_subregion = factor(
      ibra_subregion,
      names(sort(tapply(mean_predictive_info, ibra_subregion, median)))
    )
  )
) +
aes(mean_predictive_info) + geom_histogram() + facet_wrap(~ibra_subregion, 3) +
xlab("Mean Predictive Information") +
ylab("No. of taxa") +
ggtitle(
  paste(
    "Mean Predictive Information for taxa within IBRA subregions of",
    "Southeast Australia"
  )
)

ggplot(
  transform(
    subset(perf, taxon %in% mdg$taxon & random_effect),
    taxon = factor(
      taxon,
      names(
        sort(
          tapply(
            AUC[ibra_subregion == "Greater Grampians"],
            taxon[ibra_subregion == "Greater Grampians"],
            median
          )
        )
      )
    )
  )
) +
aes(
  AUC,
  mean_predictive_info,
  col = ifelse(
    ibra_subregion == "Greater Grampians",
    "Grampians",
    "Other Region"
  )
) +
geom_point() +
facet_wrap(~taxon, labeller = label_wrap_gen(40)) +
labs(col = NULL) +
xlab("AUC") +
ylab("Mean Predictive Information") +
ggtitle(
  paste(
    ""
  )
) +
theme(legend.position = "bottom")

#model_auc <- stan_glmer(
#  (AUC - .001) ~ value + (1 | ibra_subregion/taxon),
#  family = mgcv::betar,
#  iter = 1000, chains = 3,
#  data = aucdist,
#  subset = ibra_subregion != "Greater Grampians" & distance == "Geographic (km)"
#)
#summary(model_auc, regex_pars = "^[^b]", probs = c(.025, .975))
#' ---
#' title: "Validation of linear model"
#' author: "William K. Morris"
#' date: "`r Sys.Date()`"
#' output:
#'   rmarkdown::html_notebook:
#'     code_folding: hide
#' ---

#+ setup, message=FALSE, fig.keep="none", fig.show="hide"
knitr::opts_chunk$set(cache = TRUE, dev = c("png", "pdf"))
library(here)
source(here("src", "model_linear_grampians.R"))
source(here("src", "environmental_distance_metrics.R"))
library(bossMaps)
library(dplyr)
library(ggplot2)
library(gridExtra)
#library(rstanarm)
library(tidyr)

#' The following is a validation of the predictions/inference from a
#' trait-environment model of the Grampians Bioregion ~20 taxa dataset.
#'
#' The following steps are undertaken:
#'
#' * Fit a model to Grampians data.
#' * Calculate Performance Metrics:
#'     * Predict probability of occurrence
#'         * Predict within Grampians training set:
#'             * Predict for each species including species random effects.
#'             * Predict for each species without species random effects.
#'         * Predict to the South-east validation set:
#'             * Predict to each IBRA subregion within the South-east:
#'                 * Predict for each species common to Grampians and South-east
#'                   subregion:
#'                     * Predict including species random effects.
#'                     * Predict without species random effects.
#'                 * Predict for each species not found in the Grampians
#'                   training set.
#'     * For each prediction type (within-sample/out-of-sample,
#'       with-re/without-re, and modelled-species/unmodelled-species),
#'       bioregion, and species (and their combinations) calculate performance
#'       metrics

#+ mds
mds <- local({
  nms1 <-   c(
    "mean_moisture_index_of_low_qtr", "topographic_wetness_index_3sec",
    "relief_1000m_radius", "total_nitrogen"
  )
  nms2 <- c("sla_mm2_per_mg", "seed_mass_mg", "max_height_m")
  vars <- lapply(nms1, function(x) log(grampians[[x]]))
  vars <- do.call(cbind, vars)
  vars <- cbind(
    vars, do.call(cbind, lapply(nms2, function(x) log(grampians[[x]])))
  )
  vars <- scale(vars)
  cntrs <- attr(vars, "scaled:center")
  scales <- attr(vars, "scaled:scale")

  vars_new_raw <- na.omit(southeast)
  vars_new <- lapply(seq_along(nms1), function(x) log(vars_new_raw[[nms1[x]]]))
  vars_new <- do.call(cbind, vars_new)
  vars_new <- cbind(
    vars_new, do.call(cbind, lapply(nms2, function(x) log(vars_new_raw[[x]])))
  )
  vars_new <- scale(vars_new, center = cntrs, scale = scales)

  setNames(
    data.frame(
      vars_new_raw[, c("occupancy", "taxon", "ibra_subregion")], vars_new
    ),
    c(
      "y", "taxon", "ibra_subregion", "mlq", "twi", "r1k", "tn", "sla", "sm",
      "mh"
    )
  )
})

#+ perf
perf <- local({
  perf <- mapply(
    function(f, re.form) {
      f(
        model_grampians, y, list(taxon, ibra_subregion),
        rbind(mds, mdg[names(mds)]), re.form
      )
    },
    f = rep(
      list(auroc, pred_inf, dev_explained, auprc, auprc_rel, auprc_rand),
      each = 2
    ),
    re.form = list(NULL,  NA)
  )
  taxon <- sub("(.*)\\..*", "\\1", rownames(perf))
  ibra_subregion <- sub(".*\\.", "", rownames(perf))
  dim(perf) <- dim(perf) * c(2, .5)
  colnames(perf) <- c(
    "AUC", "mean_predictive_info", "deviance_explained", "AUC-PR", "Rel-AUC-PR",
    "Rand-AUC-PR"
  )
  cbind(
    taxon = rep(taxon, 2),
    ibra_subregion = rep(ibra_subregion, 2),
    random_effect = rep(c(TRUE, FALSE), each = nrow(perf) / 2),
    as.data.frame(perf)
  )
})

#+ plot-auc-dist, message = FALSE, warning = FALSE, fig.width = 7.5, fig.height = 4.5
taxon_region_prevalence <- summarise(
  group_by(rbind(mds, mdg[names(mds)]), taxon, ibra_subregion),
  prevalence = mean(y))

aucdist <- data.frame(
  perf[c("taxon", "ibra_subregion", "AUC", "AUC-PR")],
  `Geographic (km)`= dists[as.character(perf$ibra_subregion)],
  Environmental = kldists[as.character(perf$ibra_subregion)],
  Community = community_dist[as.character(perf$ibra_subregion)],
  "Dummy" = "Dummy",
  check.names = FALSE
)

aucdist <- aucdist[!perf$random_effect, ]

# Pete: "This retains two values for each model predciton, for random_effect =
#        TRUE or FALSE."
# Will: "Yes, effectively making predictions with knowledge of species ID or
#        with trait information only."

aucdist <- gather(
  aucdist, "distance", "value", `Geographic (km)`, Environmental, Community
)

aucdist <- left_join(aucdist, taxon_region_prevalence)

aucdistmean <- summarise(
  group_by(aucdist, ibra_subregion, distance),
  medauc = median(AUC),
  medauprc = median(`AUC-PR` / prevalence),
  value = mean(value)
)

aucdistplot <- ggplot(aucdist) +
  aes(AUC, value) +
  geom_vline(xintercept = .5, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauc), data = aucdistmean, size = 2, shape = 21, fill = "white"
  ) +
  xlim(.4, 1) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    strip.background = element_blank()
  )

aucbw <- ggplot(aucdist) +
  aes(x = Dummy, y = AUC) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  ylim(.4, 1) +
  scale_x_discrete(labels = "0.95") +
  xlab("Dummy") +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

aucplot_gg <- ggplot(filter(aucdist, ibra_subregion == "Greater Grampians")) +
  aes(x = AUC, y = 1) +
  geom_vline(xintercept = .5, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauc),
    data = slice(filter(aucdistmean, ibra_subregion == "Greater Grampians"), 1L),
    size = 2, shape = 21, fill = "white"
  ) +
  xlim(.4, 1) +
  ylab("Distance") +
  xlab("AUROC") +
  facet_grid(. ~ Dummy) +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

auprcdistplot <- ggplot(aucdist) +
  aes(`AUC-PR` / prevalence, value) +
  scale_x_continuous(
    trans = "log2", breaks = 2^seq(-1, 8, 2), labels = 2^seq(-1, 8, 2),
    limits = c(.5, 170)
  ) +
  geom_vline(xintercept = 1, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauprc), data = aucdistmean, size = 2, shape = 21, fill = "white"
  ) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  )

auprcbw <- ggplot(aucdist) +
  aes(x = Dummy, y = `AUC-PR` / prevalence) +
  scale_y_continuous(trans = "log2", limits = c(.5, 170)) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  scale_x_discrete(labels = "0.95") +
  xlab("Dummy") +
  theme_bw() +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

auprcplot_gg <- ggplot(filter(aucdist, ibra_subregion == "Greater Grampians")) +
  aes(x = `AUC-PR` / prevalence, y = 1) +
  scale_x_continuous(
    trans = "log2", breaks = 2^seq(-1, 8, 2), labels = 2^seq(-1, 8, 2),
    limits = c(.5, 170)
  ) +
  geom_vline(xintercept = 1, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauprc),
    data =
      slice(filter(aucdistmean, ibra_subregion == "Greater Grampians"), 1L),
    size = 2, shape = 21, fill = "white"
  ) +
  ylab("Distance") +
  xlab("AUPRC / Prevalence") +
  facet_grid(. ~ Dummy) +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

grid.arrange(
  aucplot_gg, aucbw, aucdistplot, auprcplot_gg, auprcbw, auprcdistplot,
  nrow = 2, widths = c(.12, .08, .8), heights = c(.49, .51)
)

#' ### Boxplot statistics
#+ boxplot-stats, message = FALSE, warning = FALSE
boxplot.stats(aucdist$AUC)

boxplot.stats(aucdist$`AUC-PR` / aucdist$prevalence)

#+ plot-auc-dist-shared-only, message = FALSE, warning = FALSE, fig.width = 7.5, fig.height = 4.5
aucdist_so <- filter(aucdist, taxon %in% intersect(mds$taxon, mdg$taxon))

aucdistmean_so <- summarise(
  group_by(aucdist_so, ibra_subregion, distance),
  medauc = median(AUC),
  medauprc = median(`AUC-PR` / prevalence),
  value = mean(value)
)

aucdistplot_so <- ggplot(aucdist_so) +
  aes(AUC, value) +
  geom_vline(xintercept = .5, color = "grey", size = 1.5) +
  geom_point(alpha = .1) +
  geom_point(
    aes(medauc), data = aucdistmean_so, size = 2, shape = 21, fill = "white"
  ) +
  xlim(.4, 1) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    strip.background = element_blank()
  )

aucbw_so <- ggplot(aucdist_so) +
  aes(x = Dummy, y = AUC) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  ylim(.4, 1) +
  scale_x_discrete(labels = "0.95") +
  xlab("Dummy") +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

aucplot_gg_so <- ggplot(filter(aucdist_so, ibra_subregion == "Greater Grampians")) +
  aes(x = AUC, y = 1) +
  geom_vline(xintercept = .5, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauc),
    data = slice(filter(aucdistmean_so, ibra_subregion == "Greater Grampians"), 1L),
    size = 2, shape = 21, fill = "white"
  ) +
  xlim(.4, 1) +
  ylab("Distance") +
  xlab("AUROC") +
  facet_grid(. ~ Dummy) +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

auprcdistplot_so <- ggplot(aucdist_so) +
  aes(`AUC-PR` / prevalence, value) +
  scale_x_continuous(
    trans = "log2", breaks = 2^seq(-1, 8, 2), labels = 2^seq(-1, 8, 2),
    limits = c(.5, 170)
  ) +
  geom_vline(xintercept = 1, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauprc), data = aucdistmean_so, size = 2, shape = 21, fill = "white"
  ) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  )

auprcbw_so <- ggplot(aucdist_so) +
  aes(x = Dummy, y = `AUC-PR` / prevalence) +
  scale_y_continuous(trans = "log2", limits = c(.5, 170)) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  scale_x_discrete(labels = "0.95") +
  xlab("Dummy") +
  theme_bw() +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

auprcplot_gg_so <- ggplot(filter(aucdist_so, ibra_subregion == "Greater Grampians")) +
  aes(x = `AUC-PR` / prevalence, y = 1) +
  scale_x_continuous(
    trans = "log2", breaks = 2^seq(-1, 8, 2), labels = 2^seq(-1, 8, 2),
    limits = c(.5, 170)
  ) +
  geom_vline(xintercept = 1, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(medauprc),
    data =
      slice(filter(aucdistmean_so, ibra_subregion == "Greater Grampians"), 1L),
    size = 2, shape = 21, fill = "white"
  ) +
  ylab("Distance") +
  xlab("AUPRC / Prevalence") +
  facet_grid(. ~ Dummy) +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

grid.arrange(
  aucplot_gg_so, aucbw_so, aucdistplot_so, auprcplot_gg_so, auprcbw_so,
  auprcdistplot_so,
  nrow = 2, widths = c(.12, .08, .8), heights = c(.49, .51)
)

#+ plot-dev-dist, message = FALSE, warning = FALSE, fig.width = 7.5, fig.height = 3.5
devdist <- data.frame(
  perf[c("taxon", "ibra_subregion", "deviance_explained")],
  `Geographic (km)`= dists[as.character(perf$ibra_subregion)],
  Environmental = kldists[as.character(perf$ibra_subregion)],
  Community = community_dist[as.character(perf$ibra_subregion)],
  "Dummy" = "Dummy",
  check.names = FALSE
)

devdist <- gather(
  devdist, "distance", "value", `Geographic (km)`, Environmental, Community
)

devdist <- na.omit(devdist)

colnames(devdist)[3] <- "Deviance explained"

devdistmean <- summarise(
  group_by(devdist, ibra_subregion, distance),
  meandev = mean(`Deviance explained`), value = mean(value)
)

devdistplot <- ggplot(devdist) +
  aes(`Deviance explained`, value) +
  geom_vline(xintercept = 0, color = "grey", size = 1.5) +
  geom_point(alpha=.1) +
  geom_point(
    aes(meandev), data = devdistmean, size = 2, shape = 21, fill = "white"
  ) +
  scale_x_continuous(trans = "asinh", limits = c(-56, 1)) +
  ylab("Distance") +
  facet_grid(. ~ distance, scales = "free_x") +
  coord_flip() +
  theme_bw() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    strip.background = element_blank()
  )

devbw <- ggplot(devdist) +
  aes(x = Dummy, y = `Deviance explained`) +
  geom_boxplot(fill = "grey") +
  facet_grid(. ~ Dummy) +
  scale_y_continuous(trans = "asinh", limits = c(-56, 1)) +
  scale_x_discrete(labels = "0.95") +
  xlab("Dummy") +
  theme_bw() +
  theme(
    axis.title.x = element_text(color = "transparent"),
    axis.text.x  = element_text(color = "transparent"),
    axis.ticks.x = element_line(color = "transparent"),
    strip.background = element_blank(),
    strip.text.x = element_text(color = "transparent"),
    plot.margin = margin(5.5, 0, 5.5, 5.5)
  )

grid.arrange(devbw, devdistplot, nrow = 1, widths = c(.14, .86))

#+ plot-auc, message = FALSE, warning = FALSE, fig.width = 3.5, fig.height = 8
ggplot(
  transform(
    subset(perf, ibra_subregion != "Greater Grampians" & !random_effect),
    ibra_subregion = factor(
      ibra_subregion, names(sort(tapply(AUC, ibra_subregion, median)))
    )
  )
) +
aes(AUC) + geom_histogram() +
facet_wrap(
  ~ibra_subregion, 6,
  labeller = as_labeller(
    function(x) {
      x <- gsub("-", "- ", x)
      x <- strwrap(x, width = 13, simplify = FALSE)
      vapply(x, paste, character(1), collapse = "\n")
    }
  )
) +
xlab("AUC") +
ylab("No. of taxa") +
theme_bw() +
theme(axis.text.x = element_text(size = 8))

#+ plot-auc-prev, message = FALSE, warning = FALSE, fig.width = 3.5, fig.height = 8
ggplot(
  transform(
    subset(
      left_join(perf, taxon_region_prevalence),
      ibra_subregion != "Greater Grampians" & !random_effect
    ),
    ibra_subregion = factor(
      ibra_subregion, names(sort(tapply(AUC, ibra_subregion, median)))
    )
  )
) +
aes(prevalence, AUC) +
geom_point(alpha = .5) +
facet_wrap(
  ~ibra_subregion, 6,
  labeller = as_labeller(
    function(x) {
      x <- gsub("-", "- ", x)
      x <- strwrap(x, width = 13, simplify = FALSE)
      vapply(x, paste, character(1), collapse = "\n")
    }
  )
) +
xlab("Prevalence") +
ylab("AUC") +
theme_bw() +
theme(axis.text.x = element_text(size = 8))

#+ plot-pred-info, message = FALSE, warning = FALSE, fig.width = 11
ggplot(
  transform(
    subset(perf, ibra_subregion != "Greater Grampians" & !random_effect),
    ibra_subregion = factor(
      ibra_subregion,
      names(sort(tapply(mean_predictive_info, ibra_subregion, median)))
    )
  )
) +
aes(mean_predictive_info) + geom_histogram() + facet_wrap(~ibra_subregion, 3) +
xlab("Mean Predictive Information") +
ylab("No. of taxa") +
ggtitle(
  paste(
    "Mean Predictive Information for taxa within IBRA subregions of",
    "Southeast Australia"
  )
)

#+ plot-grampians-taxa, message = FALSE, warning = FALSE, fig.width = 11
ggplot(
  transform(
    subset(perf, taxon %in% mdg$taxon & random_effect),
    taxon = factor(
      taxon,
      names(
        sort(
          tapply(
            AUC[ibra_subregion == "Greater Grampians"],
            taxon[ibra_subregion == "Greater Grampians"],
            median
          )
        )
      )
    )
  )
) +
aes(
  AUC,
  mean_predictive_info,
  col = ifelse(
    ibra_subregion == "Greater Grampians",
    "Grampians",
    "Other Region"
  )
) +
geom_point() +
facet_wrap(~taxon, labeller = label_wrap_gen(40)) +
labs(col = NULL) +
xlab("AUC") +
ylab("Mean Predictive Information") +
ggtitle(
  paste(
    ""
  )
) +
theme(legend.position = "bottom")

#+ modelauc, results = "hide", message = FALSE
#model_auc <- stan_glmer(
#  (AUC - .001) ~ value + (1 | ibra_subregion/taxon),
#  family = mgcv::betar,
#  iter = 1000, chains = 3,
#  data = aucdist,
#  subset = ibra_subregion != "Greater Grampians" & distance == "Geographic (km)"
#)

#+ summary_auc
#summary(model_auc, regex_pars = "^[^b]", probs = c(.025, .975))
