library(tidyverse)
library(glue)
library(pander)
library(BiocParallel)
library(yaml)
library(plyranges)
library(rtracklayer)
library(GenomeInfoDb)
library(Rsamtools)
library(patchwork)
library(ngsReports)
library(csaw)
library(edgeR)
library(scales)
library(magrittr)
library(statmod)
library(IHW)
library(ggrepel)
library(rlang)
library(ggside)
library(InteractionSet)
library(GenomicInteractions)
library(reactable)
library(htmltools)
library(msigdbr)
library(goseq)
library(extraChIPs)
library(tidygraph)
library(ggraph)
library(metap)
panderOptions("big.mark", ",")
panderOptions("missing", "")
panderOptions("table.split.table", Inf)
theme_set(
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5)
  )
)
register(MulticoreParam(workers = threads))
source(here::here("workflow", "scripts", "custom_functions.R"))

extra_params <- read_yaml(here::here("config", "params.yml"))
config <- read_yaml(here::here("config", "config.yml"))
macs2_path <- here::here("output", "macs2", target)
samples <- file.path(macs2_path, glue("{target}_qc_samples.tsv")) %>% 
  read_tsv() %>% 
  dplyr::filter(qc == "pass", treat %in% treat_levels) %>% 
  mutate(treat = factor(treat, levels = treat_levels)) %>% 
  droplevels()
stopifnot(nrow(samples) > 0)
rep_col <- setdiff(
  colnames(samples), c("sample", "treat", "target", "input", "label", "qc")
)
samples[[rep_col]] <- as.factor(samples[[rep_col]])
fdr_alpha <- config$comparisons$fdr
db_method <- match.arg(config$comparisons$method, c("sq-lt", "ls-ql"))
ihw_opts <- c("regions", "features", "targets")
ihw_method <- c(
  intersect(str_to_lower(config$comparisons$ihw), ihw_opts), "none"
)
ihw_method <- ihw_method[[1]]

bam_path <- here::here(config$paths$bam)
stopifnot(dir.exists(bam_path))
annotation_path <- here::here("output", "annotations")
stopifnot(dir.exists(annotation_path))
out_path <- here::here("output", "differential_binding", target)
if (!dir.exists(out_path)) dir.create(out_path, recursive = TRUE)

sq <- file.path(annotation_path, "seqinfo.rds") %>% 
  read_rds()
blacklist <- import.bed(here::here(config$external$blacklist), seqinfo = sq)
cb <- config$genome$build %>% 
  str_to_lower() %>% 
  paste0(".cytobands") 
data(list = cb)
bands_df <- get(cb)
stopifnot(
  colnames(bands_df) == c("chrom", "chromStart", "chromEnd", "name", "gieStain")
)

gtf_gene <- read_rds(file.path(annotation_path, "gtf_gene.rds"))
id2gene <- structure(gtf_gene$gene_name, names = gtf_gene$gene_id)
trans_models <- file.path(annotation_path, "trans_models.rds") %>% 
  read_rds() 

external_features <- GRanges(seqinfo = sq)
if (!is.null(config$external$features)) {
  external_features <- suppressWarnings(
    import.gff(here::here(config$external$features), genome = sq)
  )
  keep_cols <- !vapply(
    mcols(external_features), function(x) all(is.na(x)), logical(1)
  )
  mcols(external_features) <- mcols(external_features)[keep_cols]
}
has_features <- length(external_features) > 0
gene_regions <- file.path(annotation_path, "gene_regions.rds") %>% 
  read_rds()
regions <- vapply(gene_regions, function(x) unique(x$region), character(1))

rna_path <- here::here(config$external$rnaseq)
rnaseq <- tibble(gene_id = character())
if (length(rna_path) > 0) {
  stopifnot(file.exists(rna_path))
  if (str_detect(rna_path, "tsv$")) rnaseq <- read_tsv(rna_path)
  if (str_detect(rna_path, "csv$")) rnaseq <- read_csv(rna_path)
  if (!"gene_id" %in% colnames(rnaseq)) stop("Supplied RNA-Seq data must contain the column 'gene_id'")
  gtf_gene <- subset(gtf_gene, gene_id %in% rnaseq$gene_id)
  rna_lfc_col <- colnames(rnaseq)[str_detect(str_to_lower(colnames(rnaseq)), "logfc")][1]
  rna_fdr_col <- colnames(rnaseq)[str_detect(str_to_lower(colnames(rnaseq)), "fdr|adjp")][1]
}
has_rnaseq <- as.logical(nrow(rnaseq))
tx_col <- intersect(c("tx_id", "transcript_id"), colnames(rnaseq))
rna_gr_col <- ifelse(length(tx_col) > 0, "transcript_id", "gene_id")
rna_col <- c(tx_col, "gene_id")[[1]]

colours <- read_rds(here::here("output", "annotations", "colours.rds")) %>% 
  lapply(unlist)
region_colours <- setNames(colours$regions, regions[names(colours$regions)])
if (has_features)
  feature_colours <- setNames(
    colours$features, str_sep_to_title(names(colours$features))
  )
direction_colours <- colours$direction %>% 
  setNames(str_to_title(names(.))) %>% 
  .[names(.) %in% c("Up", "Down", "Unchanged", "Ambiguous")]
treat_colours <- colours$treat[treat_levels]
fig_dev <- knitr::opts_chunk$get("dev")
fig_type <- fig_dev[[1]]
if (is.null(fig_type)) stop("Couldn't detect figure type")

hic <- GInteractions()
hic_path <- here::here(config$external$hic)
if (length(hic_path) > 0)
  if (file.exists(hic_path)) {
    has_hic <- TRUE
    hic <- makeGenomicInteractionsFromFile(hic_path, type = "bedpe")
    reg_combs <- expand.grid(regions, regions) %>% 
      as.matrix() %>% 
      apply(
        MARGIN = 1, 
        function(x) {
          x <- sort(factor(x, levels = regions))
          paste(as.character(x), collapse = " - ")
        }
      ) %>% 
      unique()
    hic$regions <- anchors(hic) %>% 
      vapply(
        bestOverlap,
        y = GRangesList(lapply(gene_regions, granges)),
        character(length(hic))
      ) %>% 
      apply(MARGIN = 2, function(x) regions[x]) %>% 
      apply(
        MARGIN = 1, 
        function(x) {
          x <- sort(factor(x, levels = regions))
          paste(as.character(x), collapse = " - ")
        }
      ) %>% 
      factor(levels = reg_combs) %>%
      fct_relabel(
        str_replace_all,
        pattern = "Promoter \\([0-9kbp/\\+-]+\\)", replacement = "Promoter"
      )
    if (has_features) {
      feat_combs <- expand.grid(names(feature_colours), names(feature_colours)) %>% 
        as.matrix() %>% 
        apply(
          MARGIN = 1, 
          function(x) {
            x <- sort(factor(x, levels = names(feature_colours)))
            paste(as.character(x), collapse = " - ")
          }
        ) %>% 
        unique()
      hic$features <- vapply(
        anchors(hic),
        function(x) bestOverlap(
          x, external_features, var = "feature", missing = "no_feature"
        ),
        character(length(hic))
      )  %>% 
        apply(
          MARGIN = 1, 
          function(x) {
            x <- sort(factor(x, levels = names(feature_colours)))
            paste(as.character(x), collapse = " - ")
          }
        ) %>% 
        factor(levels = feat_combs) %>%
        fct_relabel(str_sep_to_title, pattern = "_")
    }
  }
stopifnot(is(hic, "GInteractions"))
seqlevels(hic) <- seqlevels(sq)
seqinfo(hic) <- sq
hic <- hic[!overlapsAny(hic, blacklist)]
has_hic <- as.logical(length(hic))

facet_labeller <- as_labeller(
  c(
    c(
      `TRUE` = "Union Peak",
      `FALSE` = "No Union Peak"
    ),
    structure(treat_levels, names = treat_levels),
    c(
      Up = glue("Increased {target} Binding"),
      Down = glue("Decreased {target} Binding")
    ),
    c(
      AveExpr = "logCPM", logFC = "logFC"
    )
  )
)

all_out <- list(
  results = file.path(
    out_path, 
    glue("{target}_{treat_levels[[1]]}_{treat_levels[[2]]}-differential_binding.rds")
  ),
  csv = file.path(
    out_path, 
    glue("{target}_{treat_levels[[1]]}_{treat_levels[[2]]}-differential_binding.csv.gz")
  ),
  windows  = file.path(
    out_path,
    glue("{target}_{treat_levels[[1]]}_{treat_levels[[2]]}-filtered_windows.rds")
  ),
  up_regions = file.path(
    out_path, glue("{target}_{treat_levels[[1]]}_{treat_levels[[2]]}-up.bed")
  ),
  down_regions = file.path(
    out_path, glue("{target}_{treat_levels[[1]]}_{treat_levels[[2]]}-down.bed")
  ),
  de_genes = file.path(
    out_path, glue("{target}_{treat_levels[1]}_{treat_levels[2]}-DE_genes.csv")
  ),
  enrichment = file.path(
    out_path, glue("{target}_{treat_levels[1]}_{treat_levels[2]}-enrichment.csv")
  ),
  rna_enrichment = file.path(
    out_path, 
    glue("{target}_{treat_levels[1]}_{treat_levels[2]}-rnaseq_enrichment.csv")
  ),
  renv = here::here(
    "output/envs",
    glue(
      "{target}_{treat_levels[[1]]}_{treat_levels[[2]]}-differential_binding.RData"
    )
  )
)
## Initialise outputs from RNA-Seq
cmn_diff <- cmn_up <- cmn_down <- tibble(leadingEdge = list())

Outline

Differential Binding

This step of the workflow uses a sliding window approach to differential binding, as a logical extension to the methods suggested and implemented in the csaw package (A. T. Lun and Smyth 2015).

Union Peaks derived from macs2 callpeak (Zhang et al. 2008) results are used to determine inclusion/exclusion values for sliding windows
Smooth Quantile Normalisation (Stephanie C. Hicks et al. 2017) is used on the set of logCPM values obtained from retained sliding windows. This was chosen given that many transcription factors can show vastly different cytoplasmic/nuclear distributions across treatments
The limma-trend method (Law et al. 2014) is used alongside treat (McCarthy and Smyth 2009) for detection of differential binding. This should correctly control the FDR, and allows the specification as a percentage of a suitable threshold for the estimated change in binding, below which we are not interested in any changed signal
Independent Hypothesis Weighting (IHW) (Ignatiadis et al. 2016) is additionally used to improve the power of the results. Under this strategy, p-values are partitioned based on the presence/absence of any other ChIP targets under consideration in any condition, which is considered here to be a statistically independent variable.

The workflow also depends heavily on the function implemented in the Bioconductor package extraChIPs

Enrichment Analysis

Beyond the simple analysis of differential binding, peaks are mapped to genes and enrichment testing is performed on the following:

Genes mapped to any window with detected AR are compared to all genes not mapped to any window
Genes mapped to all differentially bound windows are compared to genes mapped to windows which are not differentially bound
Genes mapped to windows with increasing AR binding are compared to genes mapped to windows which are not differentially bound
Genes mapped to windows with decreasing AR binding are compared to genes mapped to windows which are not differentially bound

Enrichment testing is performed using goseq (Young et al. 2010) with no term accounting for sampling bias, except when comparing genes mapped to any window. For this case only, gene width is used to capture any sampling bias and Wallenius’ Non-Central Hypergeometric Distribution is used. As RNA-seq data was provided, the genes considered for enrichment analysis are the 12,990 genes considered as detected in the RNA-Seq data.

Incorporation with RNA-Seq

Any association between differentially expressed genes and differentially bound sites will be assessed using Gene Set Enrichment Analysis (Subramanian et al. 2005), as implemented in the fgsea package (Korotkevich, Sukhov, and Sergushichev 2019). The sets of genes associated with changed binding will be subset by regions and any provided external features, and these novel gene-sets will be used to test for enrichment within the RNA-Seq results. ChIP-seq derived gene-sets will be tested for differential expression using genes ranked directionally and by significance alone.

Data Preparation

union_peaks <- file.path(macs2_path, glue("{target}_union_peaks.bed")) %>%
    import.bed(seqinfo = sq)

bfl <- bam_path %>%
  file.path(glue("{unique(c(samples$sample, samples$input))}.bam")) %>% 
  BamFileList() %>%
  setNames(c(samples$sample, unique(samples$input)))

ys <- 1000
bfl_summary <- bfl %>%
    bplapply(
        function(x) {
            yieldSize(x) <- ys
            sbp <- ScanBamParam(what = c("qwidth", "mapq"))
            open(x)
            vals <- scanBam(x, param = sbp)[[1]]
            close(x)
            list(as_tibble(vals))
        }
    ) %>% 
    as_tibble() %>%
    pivot_longer(everything(), names_to = "sample") %>%
    unnest(value) %>% 
    pivot_longer(
        cols = c("qwidth", "mapq"),
        names_to = "stat",
        values_to = "value"
    ) %>%
    left_join(samples, by = "sample") %>%
    mutate(
        across(
            all_of(c("target", "label", "treat")),
      \(x) str_replace_na(x, replacement = "Input")
        ),
        treat = factor(treat, levels = c("Input", treat_levels)),
        label = fct_inorder(label)
    ) %>% 
    split(f = .$stat)
anyDups <- bplapply(
    bfl,
    function(x) {
        sbp <- ScanBamParam(
            flag = scanBamFlag(isDuplicate = TRUE),
            which = GRanges(sq)[which.min(seqlengths(sq))],
            what = "qname"
            )
        length(scanBam(x, param = sbp)[[1]]$qname)  > 0
    }
) %>%
  unlist()
anyPE <- bplapply(
    bfl,
    function(x){
        yieldSize(x) <- ys
        open(x)
        flag <- scanBam(x, param=ScanBamParam(what="flag"))[[1]]$flag
        close(x)
        any(bamFlagTest(flag, "isPaired"))
    }
) %>% 
  unlist()

Taking the first 1,000 alignments, a brief inspection of the Bam Files revealed:

Aligned read lengths ranged between 50 and 76. The median length was 75.
MAPQ scores of aligned reads ranged between 2 and 42. The median score was 30.
All bam files have been de-duplicated
All alignments were single-end

Sliding Window Counts

greylist <- file.path(
  annotation_path, glue("{unique(samples$input)}_greylist.bed")
  ) %>%
  lapply(import.bed, seqinfo = sq) %>%
  GRangesList() %>%
  unlist() %>%
  reduce()

Prior to this workflow, high-signal regions were detected in any input samples associated with AR libraries and grey-lists formed. For these samples, this constituted 3,985 regions with total width of 82,913kb. Regions assigned to the greylist were added to the blacklisted regions and excluded from all analyses.

macs2_merged_logs <- file.path(
  macs2_path, glue("{target}_{treat_levels}_merged_callpeak.log")
    ) %>%
    importNgsLogs()
fl <- max(macs2_merged_logs$fragment_length)
rp <- readParam(
  pe = ifelse(any(anyPE), "both", "none"),
  dedup = any(anyDups),
  restrict = seqnames(sq),
  discard = c(granges(blacklist), greylist)
)
win_step <- 10*(1 + fl %/% 60)
win_size <- 3*win_step
window_counts <- windowCounts(
  bam.files = bfl,
  spacing = win_step,
  width = win_size,
  ext = fl,
  filter = length(bfl) - 1,
  param = rp,
  BPPARAM = bpparam()
)
colData(window_counts) <- colData(window_counts) %>% 
  as_tibble(rownames = "sample") %>% 
  dplyr::select(all_of(c("sample", "bam.files", "totals", "ext", "rlen"))) %>% 
  left_join(samples, by = "sample") %>% 
  mutate(
    treat = fct_explicit_na(treat, "Input"), 
    target = str_replace_na(target, "Input")
  ) %>% 
  as.data.frame() %>% 
  column_to_rownames("sample") %>% 
  DataFrame()
window_counts <- sortSeqlevels(window_counts)
seqinfo(window_counts) <- sq

Using the macs2-estimated fragment length of 188nt, a set of genomic sliding windows were defined using a window size of 120bp, sliding in increments of 40bp. With the exclusion of black-listed and grey-listed regions, all alignments within each window were counted for each AR-associated sample, and all relevant input samples. Any windows with fewer than 6 alignments (i.e.1 read/sample) across all samples were discarded, leaving a total of 62,349,360 sliding windows, covering 83% of the reference genome.

These windows were then filtered using the dualFilter() function from extraChIPs, discarding windows with low counts. Under this approach, two thresholds are determined above which windows are retained, and with values chosen to return 60% of sliding windows which overlap a macs2-derived union peak. These values are set to filter based on 1) signal relative to input over an extended range and, 2) overall signal level. Both filtering strategies rely on the infrastructure provided by csaw (A. T. L. Lun and Smyth 2014).

filtered_counts <- dualFilter(
  x = window_counts[, samples$sample],
  bg = window_counts[, samples$input],
  ref = union_peaks,
  keep.totals = TRUE,
  q = config$comparisons$filter_q
)
colData(filtered_counts) <- droplevels(colData(filtered_counts))
n_max <- min(5e4, nrow(filtered_counts))
sample_labeller <- as_labeller(
  setNames(colData(filtered_counts)$label, colnames(filtered_counts))
)

list(
  `Genomic Windows` = window_counts,
  `Retained Windows` = filtered_counts,
  `Union Peaks` = union_peaks
) %>% 
  lapply(granges) %>% 
  lapply(
    function(x) {
      tibble(
        N = comma(length(x)),
        `Total Width (kb)` = comma(sum(width(reduce(x))) / 1e3),
        `Median Width (bp)` = median(width(x))
      )
    }
  ) %>% 
  lapply(list) %>% 
  as_tibble() %>% 
  pivot_longer(everything(), names_to = "Dataset") %>% 
  unnest(everything()) %>% 
  left_join(
    tibble(
      Dataset = c("Retained Windows", "Union Peaks"),
      `Unique (kb)` = c(
        round(sum(width(setdiff(granges(filtered_counts), union_peaks))) / 1e3, 1),
        round(sum(width(setdiff(union_peaks, granges(filtered_counts)))) / 1e3, 1)
      ),
      `% Unique` = percent(
        1e3*`Unique (kb)` / c(
          sum(width(reduce(granges(filtered_counts)))),
          sum(width(union_peaks))
        ),
        0.1
      )
    )
  ) %>% 
  pander(
    justify = "lrrrrr",
    caption = paste(
      "A dual filtering strategy was used based on retaining", 
      percent(config$comparisons$filter_q, 0.1), 
      "of genomic windows overlapping the union peaks identified by ", 
      "`macs2 callpeak` on merged samples. This approach combined both ", 
      "1) expression percentiles, and 2) signal strength in relation to the input sample.",
      "The complete set of sliding windows covered the majority of the genome,",
      "whilst those retained after filtering were focussed on strong binding",
      "signal. Union peaks were as identified by `macs2 callpeak` in a", 
      "previous step of the workflow.",
      "Importantly, union peaks are non-overlapping, whilst the other two",
      "datasets are derived from overlapping sliding windows.",
      "This strategy of filtering the set of initial sliding windows retained",
      percent(
        1 - sum(width(setdiff(union_peaks, rowRanges(filtered_counts)))) / sum(width(union_peaks))
      ), 
      "of the genomic regions covered by union peaks, with", 
      percent(
        sum(width(setdiff(rowRanges(filtered_counts), union_peaks))) / sum(width(reduce(granges(filtered_counts)))))
      , 
      "of the retained windows being outside genomic regions covered by union peaks.",
      ifelse(
        n_max == 5e4, 
        "For subsequent density and RLE plots, a subsample of 50,000 genomic regions will be used for speed.", 
        ""
      )
    )
  )

A dual filtering strategy was used based on retaining 60.0% of genomic windows overlapping the union peaks identified by `macs2 callpeak` on merged samples. This approach combined both 1) expression percentiles, and 2) signal strength in relation to the input sample. The complete set of sliding windows covered the majority of the genome, whilst those retained after filtering were focussed on strong binding signal. Union peaks were as identified by `macs2 callpeak` in a previous step of the workflow. Importantly, union peaks are non-overlapping, whilst the other two datasets are derived from overlapping sliding windows. This strategy of filtering the set of initial sliding windows retained 73% of the genomic regions covered by union peaks, with 62% of the retained windows being outside genomic regions covered by union peaks. For subsequent density and RLE plots, a subsample of 50,000 genomic regions will be used for speed.
Dataset	N	Total Width (kb)	Median Width (bp)	Unique (kb)	% Unique
Genomic Windows	62,349,360	2,569,268	120
Retained Windows	98,742	5,066	120	3,131	61.8%
Union Peaks	7,597	2,652	317	716.7	27.0%

Normalisation

library(quantro)
library(qsmooth)

The quantro test was first applied (S. C. Hicks and Irizarry 2015) to determine if treatment-specific binding distributions were found in the data. Whilst this may not always be the case, the robustness of smooth-quantile normalisation (SQN) (Stephanie C. Hicks et al. 2017) will be applicable if data is drawn from different or near-identical distributions, and this method was applied grouping data by treatment.

qtest <- quantro(
  assay(filtered_counts, "logCPM"),
  groupFactor = filtered_counts$treat
)
pander(
  anova(qtest), 
  caption = paste(
    "*Results from qtest suggesting that the two treatment groups are drawn from",
    ifelse(
      qtest@anova$`Pr(>F)`[[1]] < 0.05,
      "different distributions.*",
      "the same distribution.*"
    )
  )
)

*Results from qtest suggesting that the two treatment groups are drawn from different distributions.*
	Df	Sum Sq	Mean Sq	F value	Pr(>F)
groupFactor	1	3.264	3.264	37.11	0.003671
Residuals	4	0.3518	0.08795

qs <-qsmooth(
  assay(filtered_counts, "logCPM"), group_factor = filtered_counts$treat
)
assay(filtered_counts, "qsmooth") <- qsmoothData(qs)

Data Inspection

QSmooth Weights

qsmoothPlotWeights(qs, xLab = "Quantiles", yLab = "Weights", mainLab = "QSmooth Weights")

Quantile-specific weights used by the Smooth-Quantile normalisation. Low weights indicate signal quantiles which appear to be more specific within a group, whilst higher weights indicate similarity between groups.

Density Plot: All Sliding Windows

plotAssayDensities(
  window_counts, trans = "log1p", n_max = n_max, colour = "treat",
  linetype = "target"
) +
  scale_colour_manual(values = colours$treat) + 
  labs(
    x = "log(counts + 1)", y = "Density", colour = "Treat", linetype = "Target"
  )

Density plot for all windows prior to the selection of windows more likely to contain true signal. Retained windows will be those at the upper end, whilst discarded windows will be at the lower end.

logCPM Densities

a <- filtered_counts %>% 
  plotAssayDensities("logCPM", colour = "treat", n_max = n_max) +
  scale_colour_manual(values = treat_colours) +
  labs(colour = "Treat")
b <- filtered_counts %>% 
  plotAssayDensities("qsmooth", colour = "treat", n_max = n_max) +
  scale_colour_manual(values = treat_colours) +
  labs(
    x = "Normalised logCPM", colour = "Treat"
  )
a + b + plot_layout(guides = "collect") + plot_annotation(tag_levels = "A")

logCPM distributions for all retained windows A) prior to smooth quantile normalisation, and 2) after SQ normalisation.

RLE (Pre-Normalisation)

filtered_counts %>% 
  plotAssayRle(
  "logCPM", fill = "treat", rle_group = "treat", n_max = n_max
) +
  geom_hline(yintercept = 0) + 
  facet_grid(.~treat, scales = "free_x", space = "free_x") +
  scale_x_discrete(labels = sample_labeller) +
  scale_fill_manual(values = treat_colours) +
  labs(x = "Sample", fill = "Treat")

RLE plot showing logCPM values. RLE values were calculated within each treatment group to account for the potentially different binding dynamics of AR.

RLE (Post-Normalisation)

filtered_counts %>% 
  plotAssayRle(
  "qsmooth", fill = "treat", rle_group = "treat", n_max = n_max
) +
  geom_hline(yintercept = 0) + 
  facet_grid(.~treat, scales = "free_x", space = "free_x") +
  scale_x_discrete(labels = sample_labeller) +
  scale_fill_manual(values = treat_colours) +
  labs(x = "Sample", fill = "Treat")

RLE plot showing normalised logCPM. RLE values were calculated within each treatment group to account for the potentially different binding dynamics of AR.

PCA

a <- plotAssayPCA(filtered_counts, "logCPM", colour = "treat", label = "label") +
  scale_colour_manual(values = treat_colours) +
  ggtitle("Pre-Normalisation")
b <- plotAssayPCA(filtered_counts, "qsmooth", colour = "treat", label = "label") +
  scale_colour_manual(values = treat_colours) +
  ggtitle("Post-Normalisation")
a + b + plot_layout(guides = "collect") + plot_annotation(tag_levels = "A")

PCA plots for logCPM values A) before and B) after Smooth Quantile normalisation

Differential Binding Analysis

Primary Analysis

X <- model.matrix(~treat, data = colData(filtered_counts)) %>%
  set_colnames(str_remove(colnames(.), "treat"))
colData(filtered_counts)$design <- X
paired_cors <- block <- txt <- NULL
if (config$comparisons$paired) {
  block <- colData(filtered_counts)[[rep_col]]
  set.seed(1e6)
  ind <- sample.int(nrow(filtered_counts), n_max, replace = FALSE)
  paired_cors <- duplicateCorrelation(
    object = assay(filtered_counts, "qsmooth")[ind, ],
    design = X,
    block = block
  )$consensus.correlation
  txt <- glue(
    "Data were nested within {{rep_col}} as a potential source of correlation. ",
    "The estimated correlation within replicate samples was $\\hat{\\rho} = {{round(paired_cors, 3)}}$",
    .open = "{{", .close = "}}"
  )
}
fc <- ifelse(is.null(config$comparisons$fc), 1, config$comparisons$fc)
fit <- assay(filtered_counts, "qsmooth") %>%
  lmFit(design = X, block = block, correlation = paired_cors) %>% 
  treat(fc = fc, trend = TRUE, robust = FALSE)
fit_mu0 <- assay(filtered_counts, "qsmooth") %>%
  lmFit(design = X, block = block, correlation = paired_cors) %>% 
  treat(fc = 1, trend = TRUE, robust = FALSE)
res_cols <- c("logFC", "AveExpr", "t", "P.Value", "fdr")
rowData(filtered_counts) <- rowData(filtered_counts) %>% 
  .[!colnames(.) %in% c(res_cols, "p_mu0")] %>% 
  cbind(
    fit %>% 
      topTable(sort.by = "none", number = Inf) %>% 
      as.list %>% 
      setNames(res_cols) %>% 
      c(
        list(
          p_mu0 = topTable(fit_mu0, sort.by = "none", number = Inf)$P.Value
        )
      )
  )

After SQ-normalisation of logCPM values, the limma-trend method (Law et al. 2014) was applied to all retained windows. A simple linear model was fitted taking E2 as the baseline and estimating the effects of E2DHT on AR binding within each sliding window, incorporating a trended prior-variance.

After model fitting, hypothesis testing was performed testing:

\[ H_0: -\lambda < \mu < \lambda \] against \[ H_A: |\mu| > \lambda \] where \(\mu\) represents the true mean change in AR binding. The value \(\lambda =\) 0.26 was chosen as this corresponds to a 20% change in detected signal on the log₂ scale. This is known as the treat method (McCarthy and Smyth 2009), and p-values were obtained for each initial window, before merging adjoining windows.

For subsequent classification when combining across multiple differential binding results, an additional p-value testing \(\mu = 0\) was added, however this is not used for detection of sites showing evidence of altered AR binding.

merged_results <- mergeByCol(
  filtered_counts, col = "AveExpr", merge_within = 2 * win_step,
  inc_cols = "p_mu0"
) %>% 
  mutate(
    overlaps_ref = overlapsAny(., union_peaks),
    direction = case_when(
      n_up == 0 & n_down == 0 ~ "Unchanged",
      logFC > 0 & n_up > n_down ~ "Up",
      logFC < 0 & n_up < n_down ~ "Down",
      TRUE ~ "Ambiguous"
    ) %>% 
      factor(levels = c("Up", "Down", "Ambiguous", "Unchanged")) %>% 
      droplevels()
  )

After selection of the 98,742 sliding windows (120bp), these were merged into 13,512 genomic regions with size ranging from 120 to 1640bp, with the median width being 360bp. Representative estimates of differential binding (i.e. logFC) were taken from the sliding window with the highest average signal across all samples, within each set of windows to be merged. Similarly, p-values from the above tests were also selected from the same sliding window as representative of the merged region (A. T. Lun and Smyth 2015). The number of windows showing increased or decreased binding within each merged region were also included, by counting those within each set of windows with p-values lower than the selected window. FDR-adjustment was performed with 9,738 regions (72.1%) showing significance, based on an FDR-adjusted p-value alone.

Independent Hypothesis Weighting

fdr_column <- str_subset(names(mcols(merged_results)), "fdr|FDR")
## Add status (direction) for the ihw = "none" possibility
merged_results <- merged_results %>% 
  mutate(
    status = case_when(
      !!sym(fdr_column) < fdr_alpha ~ as.character(direction),
      !!sym(fdr_column) >= fdr_alpha ~ "Unchanged"
    ) %>% 
      factor(levels = c(levels(direction), "Unchanged")) %>% 
      droplevels()
  )

other_targets <- here::here(config$samples$file) %>% 
  read_tsv() %>% 
  pull("target") %>% 
  unique() %>% 
  setdiff(target)
ihw_all <- other_targets %>% 
  sapply(
    function(x) {
      here::here(dirname(macs2_path), x, glue("{x}_union_peaks.bed")) %>%
        import.bed(seqinfo = sq)
    },
    simplify = FALSE
  ) %>% 
  lapply(granges) %>% 
  lapply(
    function(x) overlapsAny(merged_results, x)
  ) %>% 
  as_tibble() %>% 
  mutate(
    range = as.character(merged_results), id = seq_along(range)
  ) %>% 
  pivot_longer(
    cols = all_of(other_targets),
    names_to = "targets", 
    values_to = "overlap"
  ) %>% 
  group_by(id, range) %>% 
  summarise(
    targets = ifelse(
      any(overlap), 
      paste(sort(targets[overlap]), collapse = "+ "),
      "No Other Targets"
    ),
    .groups = "drop"
  ) %>% 
  arrange(id) %>% 
  mutate(targets = fct_infreq(targets)) %>% 
  .[["targets"]]
n_levels <- sum(table(ihw_all) > 1e3) - 1
merged_results$ihw_covariate <- fct_lump_n(ihw_all, n = n_levels)
ihw_proceed <- length(levels(merged_results$ihw_covariate)) > 1
if (ihw_proceed)  fdr_column <- "fdr_ihw"
if (ihw_proceed) {
  ihw <- ihw(
    pvalues = merged_results$P.Value,
    covariates = merged_results$ihw_covariate,
    alpha <- fdr_alpha,
    covariate_type = "nominal"
  )
  merged_results <- mutate(merged_results, fdr_ihw = adj_pvalues(ihw))
} 
merged_results <- merged_results %>% 
  mutate(
    status = case_when(
      !!sym(fdr_column) < fdr_alpha ~ as.character(direction),
      !!sym(fdr_column) >= fdr_alpha ~ "Unchanged"
    ) %>% 
      factor(levels = c(levels(direction), "Unchanged")) %>% 
      droplevels()
  )

Independent Hypothesis Weighting (IHW) (Ignatiadis et al. 2016) was then used to partition the raw p-values for AR differential binding by the detection of the other ChIP targets under investigation in this workflow (ER and H3K27ac). The presence of ER and H3K27ac was defined simply using the union peaks detected under any treatment by macs2 callpeak, as determined previously. This allows recalculation of the FDR using weighted p-values instead of raw p-values. In order for IHW to be a viable strategy, partitions should be greater than 1000. The provided union peaks were used as initial partitions in combination, merging the smallest groups below this size until all partitions were of a suitable size.

Windows were classified as overlapping a peak from a secondary ChIP target if any section of the window overlapped the secondary peak.

Summary Table

ihw %>%
  as.data.frame() %>% 
  mutate(
    fdr = p.adjust(pvalue, "fdr"),
    status = case_when(
      fdr < fdr_alpha & adj_pvalue > fdr_alpha ~ "Lost",
      fdr < fdr_alpha & adj_pvalue < fdr_alpha ~ "Retained",
      fdr > fdr_alpha & adj_pvalue < fdr_alpha ~ "Gained",
      fdr > fdr_alpha & adj_pvalue > fdr_alpha ~ "Never Significant"
    ) %>% 
      factor(
        levels = c("Never Significant", "Retained", "Gained", "Lost")
      )
  ) %>% 
  as_tibble() %>% 
  group_by(covariate, status) %>% 
  tally() %>% 
  ungroup() %>% 
  complete(status = status, covariate = covariate, fill = list(n = 0)) %>% 
  pivot_wider(
    names_from = status, values_from = n, values_fill = 0
  ) %>% 
  mutate(
    `Nett Change` = Gained - Lost
  ) %>% 
  bind_rows(
    summarise_if(., is.numeric, sum)
  ) %>% 
  mutate(
    covariate = str_replace_na(covariate, "Total"),
    Significant = Retained + Gained
  ) %>% 
  dplyr::select(
    Covariate = covariate, ends_with("Significant"), Retained, everything()
  ) %>% 
  pander(
    justify = "lrrrrrr",
    caption = paste(
      "*Summary of changes introduced by IHW for windows considered as being", 
      paste0("differentially bound by ", target, "."),
      "This corresponds to a __nett change of", 
        dplyr::slice(., nrow(.)) %>% 
        mutate(p = `Nett Change` / (Significant - `Nett Change`)) %>% 
        .[["p"]] %>% 
        percent(0.1),
      "from the initial list__.*"
    ),
    emphasize.strong.rows = nrow(.)
  )

*Summary of changes introduced by IHW for windows considered as being differentially bound by AR. This corresponds to a **nett change of 0.2% from the initial list**.*
Covariate	Never Significant	Significant	Retained	Gained	Lost	Nett Change
No Other Targets	1,684	4,790	4,790	0	46	-46
H3K27ac	1,051	3,306	3,300	6	5	1
Other	973	1,657	1,597	60	0	60
Total	3,708	9,753	9,687	66	51	15

Initial Group Sizes

tibble(ihw = ihw_all) %>% 
  ggplot(
    aes(ihw, fill = ihw)
  ) +
  geom_bar() +
  geom_hline(yintercept = 1e3, colour = "blue") +
  geom_text(
    aes(y = n + 0.025*max(n), label = comma(n, 1)),
    data = . %>%
      group_by(ihw) %>%
      summarise(n = dplyr::n(), .groups = "drop") 
  ) +
  scale_y_continuous(expand = expansion(c(0, 0.05)), labels = comma) +
  labs(x = "Peaks Overlapping Other ChIP Targets", y = "Total") +
  theme(legend.position = "none")

Breakdown of all windows which overlapped peaks from additional ChIP targets. Any partitions with fewer than 1000 windows (indicated as the blue horizontal line) were combined into the next smallest partition consecutively, until all partitions contained > 1000 windows.

P-Value Distributions

y <- ggplot_build(
  tibble(
    P.Value = merged_results$P.Value,
    ihw_covariate = merged_results$ihw_covariate
    ) %>% 
    ggplot(aes(P.Value, stat(density))) + 
    geom_histogram(bins = 100) +
    facet_wrap(~ihw_covariate) 
) %>% .[["data"]] %>% 
  .[[1]] %>% 
  .[["y"]] %>% 
  max()
merged_results %>% 
  select(P.Value, ihw_covariate) %>% 
  mcols() %>% 
  as_tibble() %>% 
  ggplot(aes(P.Value, stat(density))) +
  geom_histogram(fill = "grey", colour = "black", bins = 100) +
  geom_text(
    aes(0.8, y, label = lab),
    data = . %>% 
      group_by(ihw_covariate) %>% 
      summarise(n = dplyr::n(), .groups = "drop") %>% 
      mutate(
        lab = glue("N = {comma(n, 1)}"),
        y = y
      )
  ) +
  facet_wrap(~ihw_covariate) +
  scale_y_continuous(expand = expansion(c(0, 0.05))) +
  labs(x = "P Value", y = "Density")

P-Value distributions within all final data partitions. The size of each partition is given within each panel.

IHW Weights

plot(ihw) +
  geom_hline(yintercept = 1) +
  facet_wrap(~group) +
  scale_y_continuous(expand = expansion(c(0, 0.05))) +
  labs(x = "Fold", y = "Weight", fill = "Fold")

Weights applied to p-values within each partition. ‘Folds’ represent random sub-partitions within each larger partition generated as part of the IHW process.

FDR Comparison

ihw %>%
  as.data.frame() %>% 
  ggplot(
    aes(-log10(pvalue), -log10(weighted_pvalue), colour = fold)
  ) +
  geom_point(size = 0.4) +
  geom_hline(
    aes(yintercept = -log10(weighted_pvalue)),
    data = . %>% 
      dplyr::select(-group) %>% 
      dplyr::filter(adj_pvalue < fdr_alpha) %>% 
      dplyr::filter(weighted_pvalue == max(weighted_pvalue)),
    linetype = 2, colour = "blue"
  ) +
  geom_vline(
    aes(xintercept = -log10(pvalue)),
    data =. %>% 
      dplyr::filter(p.adjust(pvalue, "fdr") < fdr_alpha) %>% 
      dplyr::filter(pvalue == max(pvalue)) %>% 
      dplyr::select(-group),
    linetype = 2, colour = "blue"
  ) +
  facet_wrap(~group, scales = "free") +
  coord_cartesian(xlim = c(0, 5), ylim = c(0, 5)) +
  labs(
    x = expression(-log[10](p)),
    y = expression(-log[10](p[IHW])),
    colour = "Fold"
  )

Comparison of raw and weighted p-values for each partition. Blue dashed lines indicate FDR = 0.05 for each set of p-values. Those in the lower-right quadrant would no longer be considered significant after IHW, whilst those in the upper-left quadrant would only be considered as significant after the IHW process. Those in the upper-right quadrant would be considered as significant regardless of the methodology.

Mapping Windows

merged_results <- merged_results %>% 
  mutate(
    region = bestOverlap(
      ., 
      gene_regions %>% 
        lapply(select, region) %>% 
        GRangesList() %>% 
        unlist(),
      var = "region"
    ) %>% 
      factor(levels = regions)
  )
feat_prom <- GRanges()
feat_enh <- GRanges()
if (has_features) {
  merged_results <- merged_results %>% 
    mutate(
      feature = bestOverlap(
        ., external_features, "feature", missing = "no_feature"
      ) %>% 
        factor(levels = names(colours$features)) %>% 
        fct_relabel(str_sep_to_title)
    )
  if ("feature" %in% colnames(mcols(external_features))) {
    feat_prom <- external_features %>%
      subset(str_detect(feature, "[Pp]rom")) %>%
      granges()
    feat_enh <- external_features %>%
      subset(str_detect(feature, "[Ee]nhanc")) %>%
      granges()
  }
}
merged_results <- merged_results %>%
  mapByFeature(
    genes = gtf_gene, 
    prom = reduce(c(feat_prom, granges(gene_regions$promoter))),
    enh = feat_enh,
    gi = hic, 
    gr2gene = extra_params$mapping$gr2gene,
    prom2gene = extra_params$mapping$prom2gene,
    enh2gene = extra_params$mapping$enh2gene,
    gi2gene = extra_params$mapping$gi2gene
  )

In addition to statistical analysis, merged windows were first mapped to the gene-centric region with the largest overlap. Windows were then mapped to the external features provided in the same manner, followed by mapping to all annotated genes.

During mapping to genes, promoters were defined the union of all regions defined based on transcript-level annotation, and any external features which were defined as promoters. Enhancers were any regions defined in enhancer_atlas_2.0_zr75.gtf.gz as enhancers.

These features were used to map merged windows to features using the process defined in the function extraChIPs::mapByFeature():

Ranges overlapping a promoter are assigned to genes directly overlapping that specific promoter
Ranges overlapping an enhancer are assigned to all genes within 100 kb of the enhancer
Ranges overlapping a long-range interaction are assigned to all genes directly overlapping either end of the interaction
Ranges with no gene assignment from the previous steps are assigned to all overlapping genes, or the nearest gene within 100kb

Notably, genes are only passed to step 4 if no gene assignment has been made in steps 1, 2 or 3. For visualisation purposes, only genes which were considered as detected in any provided RNA-Seq data will be shown as the mapping targets.

Results

Result Tables

up_col <- function(x) {
  if (is.na(x) | is.nan(x)) return("#ffffff")
  rgb(
    colorRamp(c("#ffffff", colours$direction[["up"]]))(x), maxColorValue = 255
  )
}
down_col <- function(x) {
  if (is.na(x) | is.nan(x)) return("#ffffff")
  rgb(
    colorRamp(c("#ffffff", colours$direction[["down"]]))(x), maxColorValue = 255
  )
}
unch_col <- function(x) {
  if (is.na(x) | is.nan(x)) return("#ffffff")
  rgb(
    colorRamp(c("#ffffff", colours$direction[["unchanged"]]))(x), 
    maxColorValue = 255
  )
}
lfc_col <- function(x){
  if (is.na(x) | is.nan(x)) return("#ffffff")
  rgb(
    colorRamp(c(colours$direction[["down"]], "#ffffff", colours$direction[["up"]]))(x), 
    maxColorValue = 255
  )
}
expr_col <- function(x){
  if (is.na(x) | is.nan(x)) return("#ffffff")
  rgb(colorRamp(hcl.colors(9, "TealRose"))(x), maxColorValue = 255)
}
bar_style <- function(width = 1, fill = "#e6e6e6", height = "75%", align = c("left", "right"), color = NULL, fontSize = c()) {
  align <- match.arg(align)
  if (align == "left") {
    position <- paste0(width * 100, "%")
    image <- sprintf("linear-gradient(90deg, %1$s %2$s, transparent %2$s)", fill, position)
  } else {
    position <- paste0(100 - width * 100, "%")
    image <- sprintf("linear-gradient(90deg, transparent %1$s, %2$s %1$s)", position, fill)
  }
  styles <- list(
    backgroundImage = image,
    backgroundSize = paste("100%", height),
    backgroundRepeat = "no-repeat",
    backgroundPosition = "center",
    color = color
  )
  if (!is.null(fontSize)) styles$fontSize = fontSize
  styles
}
with_tooltip <- function(value, width = 30) {
  tags$span(title = value, str_trunc(value, width))
}
n_unch <- sum(merged_results$status == "Unchanged")
n_windows <- length(merged_results)
tbl_cols <- list(
  Region = colDef(
    footer = htmltools::tags$b("Total"), minWidth = 200
  ),
  Feature = colDef(
    footer = htmltools::tags$b("Total"), minWidth = 200
  ),
  Up = colDef(
    cell = function(value) comma(value, 1),
    style = function(value) {
      normalized <- value / (n_windows - n_unch)
      color <- up_col(normalized)
      list(background = color)
    },
    footer = function(values) htmltools::tags$b(comma(sum(values)))
  ),
  Down = colDef(
    cell = function(value) comma(value, 1),
    style = function(value) {
      normalized <- value / (n_windows - n_unch)
      color <- down_col(normalized)
      list(background = color)
    },
    footer = function(values) htmltools::tags$b(comma(sum(values)))
  ),
  Ambiguous = colDef(
    cell = function(value) comma(value, 1),
    style = function(value) {
      normalized <- value / (n_windows - n_unch)
      color <- unch_col(normalized)
      list(background = color)
    },
    footer = function(values) htmltools::tags$b(comma(sum(values)))
  ),
  Unchanged = colDef(
    cell = function(value) comma(value, 1),
    style = function(value) {
      normalized <- value / n_windows
      color <- unch_col(normalized)
      list(background = color)
    },
    footer = function(values) htmltools::tags$b(comma(sum(values)))
  ),
  `% Changed` = colDef(
    cell = function(value) percent(value, 0.1),
    style = function(value) bar_style(width = value, fill = "#B3B3B3", align = "right"),
    align = "right",
    footer = htmltools::tags$b(
      percent(sum(1 - n_unch/ n_windows), 0.1)
    )
  ),
  Total = colDef(
    cell = function(value) comma(value, 1),
    style = function(value) {
      bar_style(width = value / n_windows, fill = "#B3B3B3", align = "right")
    },
    align = "right",
    footer = function(values) htmltools::tags$b(comma(sum(values)))
  ),
  `% Of All Windows` = colDef(
    style = function(value) bar_style(width = value, fill = "#B3B3B3", align = "right"),
    format = colFormat(digits = 2, percent = TRUE)
  )
)

Overall Results

tbl <- merged_results %>% 
  mutate(w = width) %>% 
  as_tibble() %>% 
  group_by(status) %>% 
  summarise(
    n = dplyr::n(), 
    width = median(w),
    AveExpr = median(AveExpr),
    .groups = "drop"
  ) %>% 
  mutate(`%` = n / sum(n)) %>% 
  dplyr::select(status, n, `%`, everything()) %>% 
  reactable(
    searchable = FALSE, filterable = FALSE,
    columns = list(
      status = colDef(
        name = "Status", maxWidth = 150,
        footer = htmltools::tags$b("Overall")
      ),
      n = colDef(
        name = "Nbr of Windows", maxWidth = 150, format = colFormat(separators = TRUE),
        footer = htmltools::tags$b(comma(length(merged_results)))
      ),
      width = colDef(
        name = "Median Width (bp)", format = colFormat(digits = 1),
        footer = htmltools::tags$b(round(median(width(merged_results)), 1)),
        maxWidth = 200
      ),
      `%` = colDef(
        name = "% Total Windows", format = colFormat(percent = TRUE, digits = 1),
        style = function(value) bar_style(width = value, align = "right"),
        maxWidth = 150
      ),
      AveExpr = colDef(
        name = "Median Signal (logCPM)",
        format = colFormat(digits = 3),
        maxWidth = 200,
        footer = htmltools::tags$b(
          round(median(merged_results$AveExpr), 3)
        )
      )
    )
  )
cp <- htmltools::em(
  glue(
    "Overall results for changed {target} binding in the ",
    glue_collapse(rev(treat_levels), last = " Vs. "),
    " comparison."
  )
)
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

Overall results for changed AR binding in the E2DHT Vs. E2 comparison.

Summary By Region

df <- merged_results %>% 
  select(
    status, all_of(fdr_column), region
  ) %>% 
  as_tibble() %>% 
  group_by(region, status) %>% 
  summarise(n = dplyr::n(), .groups = "drop") %>% 
  complete(region, status, fill = list(n = 0)) %>% 
  group_by(region) %>% 
  mutate(Total = sum(n)) %>% 
  ungroup() %>% 
  pivot_wider(names_from = "status", values_from = "n") %>% 
  mutate(
    `% Of All Windows` = Total / length(merged_results),
    `% Changed` = 1 - Unchanged / Total
  ) %>% 
  arrange(region) %>% 
  dplyr::select(
    Region = region,
    any_of(names(direction_colours)),
    `% Changed`, Total, `% Of All Windows`
  )
cp <- htmltools::em(
  glue(
    "Overall results for changed {target} binding in the ",
    glue_collapse(rev(treat_levels), last = " Vs. "),
    " comparison, broken down by which genomic region contains the largest ", 
    "overlap with each merged window."
  )
)
tbl <- df %>%
  reactable(
    columns = tbl_cols[colnames(df)],
    defaultColDef = colDef(
      footer = function(values) htmltools::tags$b(comma(sum(values)))
    ),
    fullWidth = TRUE
  )
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

Overall results for changed AR binding in the E2DHT Vs. E2 comparison, broken down by which genomic region contains the largest overlap with each merged window.

Summary By Feature

df <- merged_results %>% 
  select(status, all_of(fdr_column), feature) %>% 
  as_tibble() %>% 
  group_by(feature, status) %>% 
  tally() %>% 
  ungroup() %>% 
  complete(feature, status, fill = list(n = 0)) %>% 
  group_by(feature) %>% 
  mutate(Total = sum(n)) %>% 
  ungroup() %>% 
  pivot_wider(names_from = "status", values_from = "n") %>% 
  mutate(
    `% Of All Windows` = Total / length(merged_results),
    `% Changed` = 1 - Unchanged / Total
  ) %>% 
  arrange(feature) %>% 
  dplyr::select(
    Feature = feature,
    any_of(names(direction_colours)),
    `% Changed`, Total, `% Of All Windows`
  )
cp <- htmltools::em(
  glue(
    "Overall results for changed {target} binding in the ",
    glue_collapse(rev(treat_levels), last = " Vs. "),
    " comparison, broken down by which external feature contains the largest ", 
    "overlap with each merged window."
  )
)
tbl <- df %>%
  reactable(
    columns = tbl_cols[colnames(df)],
    defaultColDef = colDef(
      footer = function(values) htmltools::tags$b(comma(sum(values)))
    )
  )
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

Overall results for changed AR binding in the E2DHT Vs. E2 comparison, broken down by which external feature contains the largest overlap with each merged window.

Most Highly Ranked

show_n <- min(200, length(merged_results))
scaling_vals <- list(
  logFC = c(-1, 1)*max(abs(merged_results$logFC)),
  AveExpr = range(merged_results$AveExpr),
  Width = max(width(merged_results))
)
cp <- htmltools::em(
  glue(
    "The {show_n} most highly-ranked windows by FDR, with ",
    sum(merged_results$P.Value_fdr < fdr_alpha), 
    " showing evidence of changed {target} binding. ",
    "Regions were assigned based on which genomic region showed the largest ",
    "overlap with the final merged window. ",
    ifelse(
      has_features, 
      glue("Features are as provided in the file {basename(config$external$features)}. "),
      glue("")
    ),
    "For windows mapped to large numbers of genes, hovering a mouse over the ", 
    "cell will reveal the full set of genes. ",
    ifelse(
      has_rnaseq,
      "Only genes considered as detected in the RNA-Seq data are shown. ",
      ""
    ),
    "The Macs2 Peak column is filterable using T/F values"
  )
)
fs <- 12
tbl <- merged_results %>%
  mutate(w = width) %>% 
  arrange(!!sym(fdr_column)) %>% 
  plyranges::slice(seq_len(show_n)) %>% 
  select(
    w, AveExpr, logFC, FDR = !!sym(fdr_column), 
    overlaps_ref, Region = region, any_of("feature"), Genes = gene_name
  ) %>% 
  as_tibble() %>% 
  dplyr::rename(
    Range = range, `Width (bp)` = w, `Macs2 Peak` = overlaps_ref
  ) %>% 
  rename_all(str_replace_all, "feature", "Feature") %>% 
  mutate(Genes = vapply(Genes, paste, character(1), collapse = "; ")) %>% 
  reactable(
    filterable = TRUE,
    showPageSizeOptions = TRUE,
    pageSizeOptions = c(10, 20, 50, show_n), defaultPageSize = 10,
    borderless  = TRUE,
    columns = list(
      Range = colDef(
        minWidth = 10 * fs,
        cell = function(value) {
          str_replace_all(value, ":", ": ")
        }
      ),
      `Width (bp)` = colDef(
        style = function(value) {
          x <- value / scaling_vals$Width
          colour <- expr_col(x)
          list(
            background = colour, 
            borderRight = "1px solid rgba(0, 0, 0, 0.1)"
          )
        },
        maxWidth = 5 * fs
      ),
      AveExpr = colDef(
        cell = function(value) round(value, 2),
        style = function(value) {
          x <- (value - min(scaling_vals$AveExpr)) / diff(scaling_vals$AveExpr)
          colour = expr_col(x)
          list(background = colour)
        },
        maxWidth = 5.5 * fs
      ),
      logFC = colDef(
        cell = function(value) round(value, 2),
        style = function(value) {
          x <- (value - min(scaling_vals$logFC)) / diff(scaling_vals$logFC)
          colour <- lfc_col(x)
          list(background = colour)
        },
        maxWidth = 5.5 * fs
      ),
      FDR = colDef(
        cell = function(value) sprintf("%.2e", value),
        style = list(borderRight = "1px solid rgba(0, 0, 0, 0.1)"),
        maxWidth = 5.5 * fs
      ),
      `Macs2 Peak` = colDef(
        cell = function(value) ifelse(value, "\u2714 Yes", "\u2716 No"),
        style = function(value) {
          color <- ifelse(value, "#008000", "#e00000")
          list(color = color, borderRight = "1px solid rgba(0, 0, 0, 0.1)")
        },
        maxWidth = 5 * fs
      ),
      Region = colDef(maxWidth = 150),
      Genes = colDef(
        cell = function(value) with_tooltip(value),
        minWidth = 11 * fs
      )
    ),
    theme = reactableTheme(style = list(fontSize = fs))
  )
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

The 200 most highly-ranked windows by FDR, with 9738 showing evidence of changed AR binding. Regions were assigned based on which genomic region showed the largest overlap with the final merged window. Features are as provided in the file enhancer_atlas_2.0_zr75.gtf.gz. For windows mapped to large numbers of genes, hovering a mouse over the cell will reveal the full set of genes. Only genes considered as detected in the RNA-Seq data are shown. The Macs2 Peak column is filterable using T/F values

Summary Plots

profile_width <- 5e3
n_bins <- 100
bwfl <- file.path(
    macs2_path, glue("{target}_{treat_levels}_merged_treat_pileup.bw")
  ) %>% 
  BigWigFileList() %>% 
  setNames(treat_levels) 
sig_ranges <- merged_results %>% 
  filter(!!sym(fdr_column) < fdr_alpha) %>% 
  colToRanges("keyval_range") %>% 
  splitAsList(f = .$direction) %>% 
  .[vapply(., length, integer(1)) > 0] 
fc_heat <- names(sig_ranges) %>% 
  lapply(
    function(x) {
      glue(
        "
        *Heatmap and histogram for all regions considered to show evidence of 
        {ifelse(x == 'Up', 'increased', 'decreased')} {target} binding in 
        response to {treat_levels[[2]]} treatment. A total of 
        {comma(length(sig_ranges[[x]]))} regions were in this group.
        ",
        ifelse(
          length(sig_ranges[[x]]) > 2e4, 
          " Due to the large number of regions, these were randomly down-sampled to 20,000 for viable plotting.",
          ""
        ),
        "*"
      )
    }
  ) %>% 
  setNames(names(sig_ranges))
sig_profiles <- lapply(sig_ranges, function(x) NULL)
for (i in names(sig_profiles)) {
  ## Restrict to 20,000. Plots work poorly above this number.
  ## Randomly sample
  set.seed(threads)
  temp_gr <- granges(sig_ranges[[i]])
  n <- length(temp_gr)
  if (n > 2e4) temp_gr <- temp_gr[sort(sample.int(n, 2e4))]
  sig_profiles[[i]] <- getProfileData(
    bwfl, temp_gr, upstream = profile_width / 2, bins = n_bins,
    BPPARAM = bpparam()
  )
  rm(temp_gr)
}
profile_heatmaps <- sig_profiles %>% 
  parallel::mclapply(
    plotProfileHeatmap,
    profileCol = "profile_data", 
    colour = "name",
    xLab = "Distance from Centre (bp)",
    fillLab = "logCPM",
    mc.cores = length(sig_profiles)
  ) 
fill_range <- profile_heatmaps %>% 
  lapply(function(x) x$data[,"score"]) %>% 
  unlist() %>%
  range()
sidey_range <- profile_heatmaps %>% 
  lapply(function(x) x$layers[[3]]$data$y) %>% 
  unlist() %>% 
  range()
profile_heatmaps <- profile_heatmaps %>% 
  lapply(
    function(x) {
      suppressWarnings(
        x + 
          scale_fill_gradientn(colours = colours$heatmaps, limits = fill_range) +
          scale_xsidey_continuous(limits = sidey_range) +
          scale_colour_manual(values = treat_colours) +
          labs(
            x = "Distance from Centre (bp)",
            fill = "logCPM", 
            colour = "Treat"
          ) +
          theme(
            strip.text.y = element_text(angle = 0)
          )
      )
    }
  )

MA Plot

if (!"Ambiguous" %in% levels(merged_results$status)) 
  direction_colours <- direction_colours[names(direction_colours) != "Ambiguous"]
merged_results %>% 
  as_tibble() %>%
  ggplot(aes(AveExpr, logFC)) +
  geom_point(aes(colour = status), alpha = 0.5) +
  geom_smooth(se = FALSE, method = "gam", formula = y ~ s(x, bs = "cs")) +
  geom_hline(yintercept = 0) +
  geom_label_repel(
    aes(y = 1.1*logFC, label = annot, colour = status),
    data= . %>%
      dplyr::filter(logFC == max(logFC) | logFC == min(logFC)) %>%
      mutate(
        detected = gene_name %>%
          lapply(paste, collapse = ", ") %>%
          unlist() %>%
          str_replace("^NA$", "") %>%
          str_wrap(width = 40),
        range = ifelse(detected == "", range, paste0(range, "\n")),
        annot = glue("{range}{detected}")
      ),
    fill = rgb(1, 1, 1, 0.2),
    size = 3,
    show.legend = FALSE
  ) +
  scale_colour_manual(values = direction_colours) +
  labs(
    x = "Ave Signal (logCPM)",
    colour = "Status"
  )

MA plot showing the status of each window under consideration. The two most extreme regions are labelled by region and any associated genes, whilst the overall pattern of association between signal level (logCPM) and changed signal (logFC) is shown as the blue curve.

Volcano Plot

merged_results %>% 
  as_tibble() %>% 
  ggplot(aes(logFC, -log10(P.Value))) +
  geom_point(aes(colour = status), alpha = 0.6) +
  geom_label_repel(
    aes(label = annot, colour = status),
    data= . %>%
      arrange(sign(logFC)*log10(P.Value)) %>% 
      dplyr::slice(1:2) %>% 
      dplyr::filter(logFC > 1) %>%
      mutate(
        detected = gene_name %>%
          lapply(paste, collapse = ", ") %>%
          unlist() %>%
          str_replace("^NA$", "") %>%
          str_wrap(width = 40),
        range = ifelse(detected == "", range, paste0(range, "\n")),
        annot = glue("{range}{detected}")
      ),
    fill = rgb(1, 1, 1, 0.2),
    size = 3,
    show.legend = FALSE
  ) +
  geom_label_repel(
    aes(label = annot, colour = status),
    data= . %>%
      arrange(-sign(logFC)*log10(P.Value)) %>% 
      dplyr::slice(1:2) %>% 
      dplyr::filter(logFC < -1) %>% 
      mutate(
        detected = gene_name %>%
          lapply(paste, collapse = ", ") %>%
          unlist() %>%
          str_replace("^NA$", "") %>%
          str_wrap(width = 40),
        range = ifelse(detected == "", range, paste0(range, "\n")),
        annot = glue("{range}{detected}")
      ),
    fill = rgb(1, 1, 1, 0.2),
    size = 3,
    show.legend = FALSE
  ) +
  scale_colour_manual(values = direction_colours) +
  labs(
    x = "logFC",
    y = expression(paste(-log[10], "p")),
    colour = "Status"
  )

Volcano plot showing regions with evidence of differential AR binding. The most significant regions are labelled along with any genes these regions are mapped to.

Heatmap: Increased Binding

profile_heatmaps$Up + guides(colour = "none")

Heatmap and histogram for all regions considered to show evidence of increased AR binding in response to E2DHT treatment. A total of 9,734 regions were in this group.

Heatmap: Decreased Binding

profile_heatmaps$Down + guides(colour = "none")

Heatmap and histogram for all regions considered to show evidence of decreased AR binding in response to E2DHT treatment. A total of 19 regions were in this group.

Results By Chromosome

merged_results %>% 
  select(status) %>% 
  as.data.frame() %>% 
  mutate(
    merge_status = fct_collapse(
      status,
      Up = "Up",
      Down = "Down",
      `Ambiguous/Unchanged` = intersect(c("Ambiguous", "Unchanged"), levels(status))
    )
  ) %>% 
  ggplot(aes(seqnames, fill = status)) +
  geom_bar() +
  facet_grid(merge_status~., scales = "free_y") +
  scale_y_continuous(expand = expansion(c(0, 0.05)), labels = comma) +
  scale_fill_manual(values =  direction_colours) +
  labs(
    x = "Chromosome", y = "Number of Windows", 
    fill = "Status", alpha = "Macs2 Peak"
  )

Results for differential binding separated by chromosome

Signal By Region

merged_results %>% 
  select(AveExpr, logFC, region) %>% 
  as_tibble() %>% 
  mutate(
    x = fct_relabel(
      region, str_replace_all, pattern = " \\(", replacement = "\n("
    ),
  ) %>% 
  pivot_longer(cols = c("AveExpr", "logFC")) %>% 
  ggplot(aes(x, value, fill = region)) +
  geom_boxplot() +
  facet_grid(
    name~., scales = "free", labeller = facet_labeller, switch = "y"
  ) +
  scale_fill_manual(values = region_colours) +
  labs(x = "Region", y = c(), fill = "Overlaps")

Distributions of overall signal (logCPM) and changed signal (logFC) for each genomic region.

Differential Binding By Region

merged_results %>% 
  subset(status %in% c("Up", "Down")) %>% 
  select(status, region) %>% 
  as_tibble() %>% 
  mutate(
    x = fct_relabel(
      region, str_replace_all, pattern = " \\(", replacement = "\n("
    )
  ) %>% 
  group_by(x, region, status) %>% 
  tally() %>% 
  arrange(region, desc(status)) %>% 
  mutate(y = cumsum(n)) %>% 
  ggplot(
    aes(x, n, fill = status)
  ) +
  geom_col() +
  geom_label(
    aes(y = y - 0.5*n, label = comma(n, 1), colour = status),
    data = . %>% ungroup() %>% dplyr::filter(n > 0.05*max(y)),
    fill = "white", alpha = 0.8,
    show.legend = FALSE
  ) +
  geom_text(
    aes(y = y1, label = comma(y, 1)),
    data = . %>% 
      dplyr::filter(y == max(y)) %>% 
      ungroup() %>% 
      mutate(y1 = y + 0.03*max(y))
  ) +
  scale_y_continuous(expand = expansion(c(0, 0.05)), labels = comma) +
  scale_fill_manual(values = direction_colours) +
  scale_colour_manual(values = direction_colours) +
  labs(x = "Region", y = "Merged Windows", fill = "Status")

Merged windows considered as showing differential AR binding across all genomic regions.

Signal By Feature

merged_results %>% 
  select(AveExpr, logFC, feature) %>% 
  as_tibble() %>% 
  pivot_longer(cols = c("AveExpr", "logFC")) %>% 
  ggplot(aes(feature, value, fill = feature)) +
  geom_boxplot() +
  facet_grid(
    name~., scales = "free", labeller = facet_labeller, switch = "y"
  ) +
  scale_fill_manual(values = feature_colours) +
  labs(x = "Region", y = c(), fill = "Overlaps")

Distributions of overall signal (logCPM) and changed signal (logFC) for each external features.

Differential Binding By Feature

merged_results %>% 
  subset(status %in% c("Up", "Down")) %>% 
  select(status, feature) %>% 
  as_tibble() %>% 
  group_by(feature, status) %>% 
  tally() %>% 
  arrange(feature, desc(status)) %>% 
  mutate(y = cumsum(n)) %>% 
  ungroup() %>% 
  ggplot(
    aes(feature, n, fill = status)
  ) +
  geom_col() +
  geom_label(
    aes(y = y - 0.5*n, label = comma(n, 1), colour = status),
    data = . %>%  dplyr::filter(n > 0.05*max(y)),
    fill = "white", alpha = 0.8,
    show.legend = FALSE
  ) +
  geom_text(
    aes(y = y1, label = comma(y, 1)),
    data = . %>% 
      group_by(feature) %>% 
      dplyr::filter(y == max(y)) %>% 
      ungroup() %>% 
      mutate(y1 = y + 0.03*max(y))
  ) +
  scale_y_continuous(expand = expansion(c(0, 0.05)), labels = comma) +
  scale_fill_manual(values = direction_colours) +
  scale_colour_manual(values = direction_colours) +
  labs(x = "Feature", y = "Merged Windows", fill = "Status")

Merged windows considered as showing differential AR binding across all external features.

Differential Binding By Region And Feature

df <- merged_results %>% 
  as_tibble() %>% 
  dplyr::filter(status %in% c("Up", "Down")) %>% 
  droplevels() %>% 
  group_by(region, feature, status) %>% 
  summarise(n = dplyr::n(), .groups = "drop_last") %>% 
  mutate(total = sum(n)) %>% 
  ungroup() %>% 
  mutate(
    region = fct_relabel(region, str_wrap, width = 20),
    x0 = as.integer(region),
    y0 = as.integer(feature),
    r = sqrt(total / sum(total)),
    r = 0.5* r / max(r)
  ) 
df %>% 
  ggplot() +
  ggforce::stat_pie(
    aes(x0 = x0, y0 = y0, r0 = 0, r = r, fill = status, amount = n)
  ) +
  coord_equal() +
  geom_label(
    aes(x0, y0, label = comma(total)),
    data = . %>% 
      distinct(x0, y0, total) %>% 
      dplyr::filter(total > 0.01 * sum(total)),
    alpha = 0.6,
    inherit.aes = FALSE,
    show.legend = FALSE
  ) +
  geom_xsidecol(aes(x = x0, y = n, fill = status)) +
  geom_ysidecol(aes(x = n, y = y0, fill = status)) +
  scale_x_continuous(
    breaks = seq_along(levels(df$region)), labels = levels(df$region)
  ) +
  scale_y_continuous(
    breaks = seq_along(levels(df$feature)), labels = levels(df$feature)
  ) +
  scale_fill_manual(values = direction_colours) +
  scale_xsidey_continuous(name = c(), expand = expansion(c(0, 0.1))) +
  scale_ysidex_continuous(name = c(), expand = expansion(c(0, 0.1))) +
  labs(
    x = "Region", y = "Feature", fill = "Status"
  ) +
  theme(
    legend.position = "right",
    ggside.panel.scale = 0.2
  )

Comparison of changes in AR binding separated by gene-centric region and external features provided in enhancer_atlas_2.0_zr75.gtf.gz

Inspection of Top-Ranked Regions

## Define a GRL with the key ranges.
## Then we can step through it & make all of the requisite plots
grl_to_plot <- GRangesList(
  top_up_by_fdr = merged_results %>% 
    filter(logFC > 0) %>% 
    filter(P.Value == min(P.Value)) %>% 
    filter(!!sym(fdr_column) < fdr_alpha),
  top_up_by_lfc = merged_results %>% 
    filter(logFC == max(logFC), logFC > 0, !!sym(fdr_column) < fdr_alpha),
  top_up_by_expr = merged_results %>% 
    filter(logFC > 0, !!sym(fdr_column) < fdr_alpha) %>% 
    filter(AveExpr == max(AveExpr)),
  top_up_no_macs2 = merged_results %>% 
    filter(logFC > 0, !!sym(fdr_column) < fdr_alpha, !overlaps_ref) %>% 
    arrange(!!sym(fdr_column)) %>% 
    arrange(P.Value),
  top_down_by_fdr = merged_results %>% 
    filter(logFC < 0) %>% 
    filter(P.Value == min(P.Value)) %>% 
    filter(!!sym(fdr_column) < fdr_alpha),
  top_down_by_lfc = merged_results %>% 
    filter(logFC == min(logFC), logFC < 0, !!sym(fdr_column) < fdr_alpha),
  top_down_by_expr = merged_results %>% 
    filter(logFC < 0, !!sym(fdr_column) < fdr_alpha) %>% 
    filter(AveExpr == max(AveExpr)),
  top_up_no_macs2 = merged_results %>% 
    filter(logFC < 0, !!sym(fdr_column) < fdr_alpha, !overlaps_ref) %>% 
    arrange(P.Value)
) %>% 
  unlist() %>% 
  .[!duplicated(.)] %>%
  splitAsList(names(.)) %>% 
  lapply(setNames, c()) %>% 
  lapply(function(x) x[1]) %>%
  GRangesList()
fh <- max(4, 1 + 3 * ceiling(length(grl_to_plot) / 3)) %>% 
  min(knitr::opts_chunk$get("fig.height"))

## The coverage
bwfl <- list2(
  "{target}" := file.path(
    macs2_path, glue("{target}_{treat_levels}_merged_treat_pileup.bw")
  ) %>% 
    BigWigFileList() %>% 
    setNames(treat_levels)
)
line_col <- list2("{target}" := treat_colours)
## Coverage annotations
annot <- merged_results %>% 
  splitAsList(.$status) %>% 
  .[vapply(., length, integer(1)) > 0] %>% 
  endoapply(granges) %>% 
  list() %>% 
  setNames(target)
## Coverage y-limits
ind <- filtered_counts %>% 
  assay("counts") %>% 
  apply(MARGIN = 2, which.max) %>%
  unique()
max_ranges <- rowRanges(filtered_counts[ind])
y_lim <- bwfl[[target]] %>%
  lapply(import.bw, which = max_ranges) %>%
  lapply(function(x) c(0, max(x$score))) %>% 
  unlist() %>% 
  range() %>% 
  list() %>% 
  setNames(target)

## The features track
feat_gr <- gene_regions %>% 
  lapply(granges) %>% 
  GRangesList()
feature_colours <- colours$regions
if (has_features) {
  feat_gr <- list(Regions = feat_gr)
  feat_gr$Features <- splitAsList(external_features, external_features$feature)
  feature_colours <- list(
    Regions = unlist(colours$regions),
    Features = unlist(colours$features)
  )
}

## The genes track
hfgc_genes <- trans_models
gene_col <- "grey"
if (has_rnaseq){
  if (!is.na(rna_lfc_col) & !is.na(rna_fdr_col)) {
    hfgc_genes <- trans_models %>% 
      mutate(
        status = case_when(
          !gene %in% rnaseq$gene_id ~ "Undetected",
          gene %in% dplyr::filter(
            rnaseq, !!sym(rna_lfc_col) > 0, !!sym(rna_fdr_col) < fdr_alpha
          )$gene_id ~ "Up",
          gene %in% dplyr::filter(
            rnaseq, !!sym(rna_lfc_col) < 0, !!sym(rna_fdr_col) < fdr_alpha
          )$gene_id ~ "Down",
          gene %in% dplyr::filter(
            rnaseq, !!sym(rna_fdr_col) >= fdr_alpha
          )$gene_id ~ "Unchanged",
        )
      ) %>% 
      splitAsList(.$status) %>% 
      lapply(select, -status) %>% 
      GRangesList()
    gene_col <- colours$direction %>% 
      setNames(str_to_title(names(.)))
  }
}

## External Coverage (Optional)
if (!is.null(config$external$coverage)) {
  ext_cov_path <- config$external$coverage %>% 
    lapply(unlist) %>% 
    lapply(function(x) setNames(here::here(x), names(x)))
  bwfl <- c(
    bwfl[target],
    lapply(ext_cov_path, function(x) BigWigFileList(x) %>% setNames(names(x)))
  )
  line_col <- c(
    line_col[target],
    ext_cov_path %>% 
      lapply(
        function(x) {
          missing <- setdiff(names(x), names(colours$treat))
          cmn <- intersect(names(x), names(colours$treat))
          col <- setNames(character(length(names(x))), names(x))
          if (length(cmn) > 0) col[cmn] <- colours$treat[cmn]
          if (length(missing) > 0) 
            col[missing] <- hcl.colors(
              max(5, length(missing)), "Zissou 1")[seq_along(missing)]
          col
        }
      )
  )
  
  y_ranges <- grl_to_plot %>% 
    unlist() %>% 
    granges() %>% 
    resize(w = 10 * width(.), fix = 'center')
  
  y_lim <- c(
    y_lim[target],
    bwfl[names(bwfl) != target] %>% 
      lapply(
        function(x) {
          GRangesList(lapply(x, import.bw, which = y_ranges)) %>% 
            unlist() %>% 
            filter(score == max(score)) %>% 
            mcols() %>% 
            .[["score"]] %>% 
            c(0) %>% 
            range()
        }
      )
  )
  
}

makeCaption <- function(.gr) {
  if (is.null(.gr)) return(NULL)
  dir <- ifelse(.gr$direction == "Down", 'decreased', 'increased')
  reg <- case_when(
    str_detect(.gr$region, "Inter") ~ paste("an", .gr$region, "region"),
    str_detect(.gr$region, "Upstream") ~ paste("an", .gr$region),
    str_detect(.gr$region, "(Ex|Intr)on") ~ paste("an", .gr$region),
    str_detect(.gr$region, "^Prom") ~ paste("a", .gr$region)
  )
  feat <- c()
  if (has_features) feat <- case_when(
    str_detect(.gr$feature, "^[AEIOU]") ~ paste("an", .gr$feature),
    !str_detect(.gr$feature, "^[AEIOU]") ~ paste("a", .gr$feature)
  )
  gn <- unlist(.gr$gene_name)
  fdr <- mcols(.gr)[[fdr_column]]
  fdr <- ifelse(
    fdr < 0.001, sprintf('%.2e', fdr), sprintf('%.3f', fdr)
  )
  cp <- c(
    glue(
      "*The {width(.gr)}bp region showing {dir} {target} binding in response to ", 
      "{treat_levels[[2]]} treatment (FDR = {fdr}). ",
      "The range mostly overlapped with {reg}, with all ",
      "defined regions shown as a contiguous bar. ",
      ifelse(
        has_features,
        glue(
          "Using the features supplied in {basename(config$external$features)}, ",
          "this mostly overlapped {feat}, shown as a separate block ",
          "with the gene-centric regions. "
        ),
        glue("")
      ),
    ),
    ifelse(
      .gr$overlaps_ref,
      paste(
        "A union peak overlapping this region was identified by",
        "`macs2 callpeak` when using merged samples."
      ),
      "No union peak was identified using `macs2 callpeak`."
    ),
    ifelse(
      length(gn) > 0,
      paste0(
        "Using the above mapping strategy, this range is likely to regulate ",
        collapseGenes(gn, format = "")
      ),
      "No genes were able to be assigned to this region."
    ),
    paste(
      "For each sample, the y-axis limits represent the values from the window", 
      "with the highest signal.*"
    )
  )
  paste(cp, collapse = " ")
}

Normalised Sliding Windows

grl_to_plot %>% 
  lapply(
    function(x){
      filtered_counts %>% 
        subsetByOverlaps(x) %>% 
        assay("qsmooth") %>% 
        as_tibble(rownames = "window") %>% 
        pivot_longer(
          cols = all_of(colnames(filtered_counts)),
          names_to = "sample",
          values_to = "logCPM"
        ) %>% 
        mutate(range = as.character(x)) %>% 
        list()
    }
  ) %>% 
  as_tibble() %>% 
  pivot_longer(everything()) %>% 
  unnest(everything()) %>% 
  mutate(
    direction = str_extract(name, "down|up"),
    by = str_extract(name, "by_.+"),
    across(
      all_of(c("name", "direction", "by")), str_sep_to_title
    ),
    name = str_remove_all(name, "Top ") %>% 
      str_replace_all("Lfc", "logFC") %>% 
      str_replace_all("Fdr", "FDR") %>% 
      str_replace_all("Expr", "Highest Signal") %>% 
      str_replace_all("By (.+)", "(\\1)") %>% 
      str_replace_all("No Macs2", "(No Macs2 Peak)"),
    window = as.integer(window)
  ) %>% 
  left_join(samples, by = "sample") %>% 
  ggplot(
    aes(window, logCPM, colour = treat, linetype = !!sym(rep_col))
  ) +
  geom_line(
    data = . %>% 
      group_by(name) %>% 
      dplyr::filter(max(window) > 1)
  ) + 
  geom_point(
    data = . %>% 
      group_by(name) %>% 
      dplyr::filter(max(window) == 1),
    show.legend = FALSE
  ) + 
  facet_wrap(~name + range, scales = "free_x") +
  scale_colour_manual(values = treat_colours) + 
  labs(
    x = "Sliding Window\n(Unmerged)", y = "SQ-Normalised logCPM", colour = "Treat",
    linetype = str_sep_to_title(rep_col)
  )

Most highly ranked ranges for both gained and decreased AR binding in repsonse to E2DHT treatment. The smooth-quantile normalised values are shown across the initial set of sliding windows before merging. Ranges were chosen as the most extreme for FDR, Binding Strength (Signal) and logFC. Windows are shown free of the genomic context.

Decreased Binding (Strongest Signal)

plotHFGC(
  grl_to_plot$top_down_by_expr,
  hic = hic,
  features = feat_gr,featcol = feature_colours, featsize = 1 + has_features,
  genes = hfgc_genes, genecol = gene_col,
  coverage = bwfl, linecol = line_col,
  annotation = annot, 
  annotcol = direction_colours,
  cytobands = bands_df,
  collapseTranscripts = FALSE,
  zoom = 10,
  max = 10 * width(grl_to_plot$top_down_by_expr),
  ylim = y_lim,
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

The 600bp region showing decreased AR binding in response to E2DHT treatment (FDR = 0.038). The range mostly overlapped with an Intergenic (>10kb) region, with all defined regions shown as a contiguous bar. Using the features supplied in enhancer_atlas_2.0_zr75.gtf.gz, this mostly overlapped an Enhancer, shown as a separate block with the gene-centric regions. A union peak overlapping this region was identified by macs2 callpeak when using merged samples. No genes were able to be assigned to this region. For each sample, the y-axis limits represent the values from the window with the highest signal.

Decreased Binding (Lowest FDR)

plotHFGC(
  grl_to_plot$top_down_by_fdr,
  hic = hic,
  features = feat_gr,featcol = feature_colours, featsize = 1 + has_features,
  genes = hfgc_genes, genecol = gene_col,
  coverage = bwfl, linecol = line_col,
  annotation = annot, 
  annotcol = direction_colours,
  cytobands = bands_df,
  collapseTranscripts = FALSE,
  zoom = 10,
  max = 10 * width(grl_to_plot$top_down_by_fdr),
  ylim = y_lim,
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

The 520bp region showing decreased AR binding in response to E2DHT treatment (FDR = 0.003). The range mostly overlapped with an Intron, with all defined regions shown as a contiguous bar. Using the features supplied in enhancer_atlas_2.0_zr75.gtf.gz, this mostly overlapped an Enhancer, shown as a separate block with the gene-centric regions. A union peak overlapping this region was identified by macs2 callpeak when using merged samples. No genes were able to be assigned to this region. For each sample, the y-axis limits represent the values from the window with the highest signal.

Decreased Binding (Most Extreme)

plotHFGC(
  grl_to_plot$top_down_by_lfc,
  hic = hic,
  features = feat_gr,featcol = feature_colours, featsize = 1 + has_features,
  genes = hfgc_genes, genecol = gene_col,
  coverage = bwfl, linecol = line_col,
  annotation = annot, 
  annotcol = direction_colours,
  cytobands = bands_df,
  collapseTranscripts = FALSE,
  zoom = 10, 
  max = 10 * width(grl_to_plot$top_down_by_lfc),
  ylim = y_lim,
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

The 120bp region showing decreased AR binding in response to E2DHT treatment (FDR = 0.011). The range mostly overlapped with an Intron, with all defined regions shown as a contiguous bar. Using the features supplied in enhancer_atlas_2.0_zr75.gtf.gz, this mostly overlapped a No Feature, shown as a separate block with the gene-centric regions. No union peak was identified using macs2 callpeak. Using the above mapping strategy, this range is likely to regulate DOCK2 For each sample, the y-axis limits represent the values from the window with the highest signal.

Increased Binding (Strongest Signal)

plotHFGC(
  grl_to_plot$top_up_by_expr,
  hic = hic,
  features = feat_gr,featcol = feature_colours, featsize = 1 + has_features,
  genes = hfgc_genes, genecol = gene_col,
  coverage = bwfl, linecol = line_col,
  annotation = annot, 
  annotcol = direction_colours,
  cytobands = bands_df,
  collapseTranscripts = FALSE,
  zoom = 10,
  max = 10 * width(grl_to_plot$top_up_by_expr),
  ylim = y_lim,
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

The 880bp region showing increased AR binding in response to E2DHT treatment (FDR = 0.001). The range mostly overlapped with an Intergenic (<10kb) region, with all defined regions shown as a contiguous bar. Using the features supplied in enhancer_atlas_2.0_zr75.gtf.gz, this mostly overlapped an Enhancer, shown as a separate block with the gene-centric regions. A union peak overlapping this region was identified by macs2 callpeak when using merged samples. Using the above mapping strategy, this range is likely to regulate DTX3L, HSPBAP1, PARP9 and PARP14 For each sample, the y-axis limits represent the values from the window with the highest signal.

Increased Binding (Lowest FDR)

plotHFGC(
  grl_to_plot$top_up_by_fdr,
  hic = hic,
  features = feat_gr,featcol = feature_colours, featsize = 1 + has_features,
  genes = hfgc_genes, genecol = gene_col,
  coverage = bwfl, linecol = line_col,
  annotation = annot, 
  annotcol = direction_colours,
  cytobands = bands_df,
  collapseTranscripts = FALSE,
  zoom = 10,
  max = 10 * width(grl_to_plot$top_up_by_fdr),
  ylim = y_lim,
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

The 1200bp region showing increased AR binding in response to E2DHT treatment (FDR = 1.08e-12). The range mostly overlapped with an Intron, with all defined regions shown as a contiguous bar. Using the features supplied in enhancer_atlas_2.0_zr75.gtf.gz, this mostly overlapped an Enhancer, shown as a separate block with the gene-centric regions. A union peak overlapping this region was identified by macs2 callpeak when using merged samples. Using the above mapping strategy, this range is likely to regulate CEP128 For each sample, the y-axis limits represent the values from the window with the highest signal.

Increased Binding (Most Extreme)

plotHFGC(
  grl_to_plot$top_up_by_lfc,
  hic = hic,
  features = feat_gr,featcol = feature_colours, featsize = 1 + has_features,
  genes = hfgc_genes, genecol = gene_col,
  coverage = bwfl, linecol = line_col,
  annotation = annot, 
  annotcol = direction_colours,
  cytobands = bands_df,
  collapseTranscripts = FALSE,
  zoom = 10,
  max = 10 * width(grl_to_plot$top_up_by_lfc),
  ylim = y_lim,
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

The 600bp region showing increased AR binding in response to E2DHT treatment (FDR = 3.82e-11). The range mostly overlapped with an Intron, with all defined regions shown as a contiguous bar. Using the features supplied in enhancer_atlas_2.0_zr75.gtf.gz, this mostly overlapped a No Feature, shown as a separate block with the gene-centric regions. A union peak overlapping this region was identified by macs2 callpeak when using merged samples. No genes were able to be assigned to this region. For each sample, the y-axis limits represent the values from the window with the highest signal.

Significant Increased Binding (No Union Peak)

plotHFGC(
  grl_to_plot$top_up_no_macs2,
  hic = hic,
  features = feat_gr,featcol = feature_colours, featsize = 1 + has_features,
  genes = hfgc_genes, genecol = gene_col,
  coverage = bwfl, linecol = line_col,
  annotation = annot, 
  annotcol = direction_colours,  
  cytobands = bands_df,
  collapseTranscripts = FALSE,
  zoom = 10,
  max = 10 * width(grl_to_plot$top_up_no_macs2),
  ylim = y_lim,
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

The 640bp region showing increased AR binding in response to E2DHT treatment (FDR = 4.37e-12). The range mostly overlapped with an Intron, with all defined regions shown as a contiguous bar. Using the features supplied in enhancer_atlas_2.0_zr75.gtf.gz, this mostly overlapped a No Feature, shown as a separate block with the gene-centric regions. No union peak was identified using macs2 callpeak. Using the above mapping strategy, this range is likely to regulate ATAD2 For each sample, the y-axis limits represent the values from the window with the highest signal.

Enrichment Testing

min_gs_size <- extra_params$enrichment$min_size
if (is.null(min_gs_size) | is.na(min_gs_size)) min_gs_size <- 0
max_gs_size <- extra_params$enrichment$max_size
if (is.null(max_gs_size) | is.na(max_gs_size)) max_gs_size <- Inf
min_sig <- extra_params$enrichment$min_sig
if (is.null(min_sig) | is.na(min_sig)) min_sig <- 1
msigdb <- msigdbr(species = "Homo sapiens") %>% 
  dplyr::filter(
    gs_cat %in% unlist(extra_params$enrichment$msigdb$gs_cat) |
      gs_subcat %in% unlist(extra_params$enrichment$msigdb$gs_subcat),
    str_detect(ensembl_gene, "^E")
  ) %>% 
  dplyr::rename(gene_id = ensembl_gene, gene_name = gene_symbol) %>% # For easier integration 
  dplyr::select(-starts_with("human"), -contains("entrez")) %>% 
  dplyr::filter(gene_id %in% gtf_gene$gene_id) %>% 
  mutate(gs_url = str_extract(gs_url, "^[^|]+")) %>% 
  group_by(gs_name) %>% 
  mutate(n = dplyr::n()) %>% 
  ungroup() %>% 
  dplyr::filter(n >= min_gs_size, n < max_gs_size) 
gs_by_gsid <- msigdb %>% 
  split(.$gs_name) %>% 
  lapply(pull, "gene_id") %>% 
  lapply(unique)
gs_by_geneid <- msigdb %>% 
  split(.$gene_id) %>% 
  lapply(pull, "gs_name") %>% 
  lapply(unique)
gs_url <- msigdb %>% 
  distinct(gs_name, gs_url) %>%
  mutate(
    gs_url = ifelse(
      gs_url == "", "
      http://www.gsea-msigdb.org/gsea/msigdb/collections.jsp",
      gs_url
    )
  ) %>% 
  with(setNames(gs_url, gs_name))

min_network_size <- extra_params$networks$min_size
if (is.null(min_network_size) | is.na(min_network_size))
  min_network_size <- 2

max_network_size <- extra_params$networks$max_size
if (is.null(max_network_size) | is.na(max_network_size))
  max_network_size <- Inf

max_network_dist <- extra_params$networks$max_distance
if (is.null(max_network_dist) | is.na(max_network_dist))
  max_network_dist <- 1

net_layout <- extra_params$networks$layout
enrich_alpha <- extra_params$enrichment$alpha
adj_method <- match.arg(extra_params$enrichment$adj, p.adjust.methods)
adj_desc <- case_when(
  p.adjust.methods %in% c("fdr", "BH") ~ "the Benjamini-Hochberg FDR",
  p.adjust.methods %in% c("BY") ~ "the Benjamini-Yekutieli FDR",
  p.adjust.methods %in% c("bonferroni") ~ "the Bonferroni",
  p.adjust.methods %in% c("holm") ~ "Holm's",
  p.adjust.methods %in% c("hommel") ~ "Hommel's",
  p.adjust.methods %in% c("hochberg") ~ "Hochberg's",
  p.adjust.methods %in% c("none") ~ "no"
) %>% 
  setNames(p.adjust.methods)
mapped_ids <- merged_results %>%
  as_tibble() %>% 
  dplyr::select(status, gene_id) %>%
  unnest(gene_id) %>%
  group_by(gene_id) %>%
  summarise(
    up = any(status == "Up"), 
    down = any(status == "Down"), 
    n_peaks = dplyr::n(),
    .groups = "drop"
  )

As an initial exploration, the retained windows were compared to pre-defined gene-sets. These were taken from the MSigDB database and the gene-sets used for enrichment testing were restricted to only include those with between 5 and 1000 genes.

msigdb %>% 
  distinct(gs_cat, gs_subcat, gs_name) %>%
  group_by(gs_cat, gs_subcat) %>% 
  summarise(`Gene Sets` = dplyr::n(),.groups = "drop") %>% 
  dplyr::rename(Category = gs_cat, `Sub-Category` = gs_subcat) %>% 
  mutate(
    Description = case_when(
      Category == "H" ~ "Hallmark Gene-Sets",
      Category == "C1" ~ "Positional Gene Sets",
      Category == "C2" ~ str_replace_all(`Sub-Category`, "CP:(.+)", "Curated Gene-Sets: \\1"),
      str_detect(`Sub-Category`, "TFT") ~ str_replace_all(`Sub-Category`, "TFT:(.+)", "Transcription Factor Target Prediction Gene-Sets: \\1"),
      str_detect(`Sub-Category`, "MIR") ~ str_replace_all(`Sub-Category`, "MIR:(.+)", "microRNA Target Gene-Sets: \\1"),
      str_detect(`Sub-Category`, "CGN") ~ "Cancer Gene Neighbourhoods",
      str_detect(`Sub-Category`, "CM") ~ "Cancer Modules",
      str_detect(`Sub-Category`, "GO:BP") ~ "Gene Ontology: Bological Process",
      str_detect(`Sub-Category`, "GO:MF") ~ "Gene Ontology: Molecular Function",
      str_detect(`Sub-Category`, "GO:CC") ~ "Gene Ontology: Cellular Component",
      str_detect(`Sub-Category`, "HPO") ~ "Human Phenotype Ontology",
      Category == "C6" ~ "Oncogenic Signature Gene Sets",
      `Sub-Category` == "IMMUNESIGDB" ~ "ImmuneSigDB Gene Sets",
      `Sub-Category` == "VAX" ~ "Vaccine Response Gene Sets",
      Category == "C8" ~ "Cell Type Signature Gene Sets"
    ),
    Category = factor(Category, levels = c("H", paste0("C", 1:8)))
  ) %>% 
  droplevels() %>% 
  arrange(Category) %>% 
  pander(
    justify = "llrl", 
    caption = "*Summary of Gene-Sets used for Enrichment Testing*"
  )

*Summary of Gene-Sets used for Enrichment Testing*
Category	Sub-Category	Gene Sets	Description
H		50	Hallmark Gene-Sets
C2	CP:KEGG	183	Curated Gene-Sets: KEGG
C2	CP:REACTOME	1,388	Curated Gene-Sets: REACTOME
C2	CP:WIKIPATHWAYS	605	Curated Gene-Sets: WIKIPATHWAYS
C3	TFT:GTRD	425	Transcription Factor Target Prediction Gene-Sets: GTRD

Retained windows were tested for enrichment of these gene-sets using:

Any window mapped to a gene from these gene-sets
Any window with evidence of differential binding mapped to a gene from these gene-sets

In the first case, mapped genes were tested for enrichment against all annotated genes, or all detected genes if RNA-Seq data was provided. In the second case, i.e. Differentially Bound Windows, the control set of genes were those mapped to a window (i.e. those test set from step 1). All adjusted p-values below are calculated using the Benjamini-Hochberg FDR adjustment. After adjustment, any enriched gene-sets with fewer than 3 mapped genes were excluded as being uninformative.

If 4 or more gene-sets were considered to be enriched, a network plot will be produced for that specific analysis, with network sizes capped at 80. The distances between gene-sets were calculated using the overlap coefficient (OC). Gene-sets with a large overlap will thus be given small distances and in the case of a complete overlap (\(OC = 1\)) the only most highly ranked gene-set was retained. Gene-set (i.e. node) pairs with a distance > 0.9 will not have edges drawn between them and edge width also corresponds to the distance between nodes with closely related nodes having thicker edges. All network plots were generated using the fr layout algorithm.

Windows With Detected AR

pwf_mapped <- gtf_gene %>% 
  mutate(w = width(.)) %>% 
  as_tibble() %>% 
  dplyr::select(gene_id, w) %>% 
  arrange(desc(w)) %>% 
  distinct(gene_id, .keep_all = TRUE) %>% 
  mutate(mapped = gene_id %in% unlist(merged_results$gene_id)) %>% 
  with(
    nullp(
      structure(mapped, names = gene_id),
      genome = config$genome$build,
      bias.data = log10(w),
      plot.fit = TRUE
    )
  )

goseq_mapped <- tibble(gs_name = character(), adj_p = numeric())
if (sum(pwf_mapped$DEgenes) > 0) {
  goseq_mapped <- goseq(
    pwf_mapped, config$genome$build, gene2cat = gs_by_geneid
  ) %>% 
    dplyr::rename(gs_name = category, pval = over_represented_pvalue) %>% 
    dplyr::select(-starts_with("under")) %>% 
    as_tibble() %>% 
    arrange(pval) %>% 
    dplyr::filter(numDEInCat > 0) %>% 
    mutate(adj_p = p.adjust(pval, adj_method)) %>% 
    dplyr::filter(numDEInCat >= min_sig)
}
any_goseq_mapped <- sum(goseq_mapped$adj_p < enrich_alpha) > 0
tg_mapped <- make_tbl_graph(
  goseq_mapped, 
  gs = gs_by_gsid %>% 
    lapply(intersect, rownames(subset(pwf_mapped, DEgenes))) %>% 
    .[vapply(., length, integer(1)) > 0]
)
plot_network_mapped <- length(tg_mapped) >= min_network_size

Results Table

cp <- htmltools::tags$caption(
  htmltools::em(
    glue(
      "
    All {sum(goseq_mapped$adj_p < enrich_alpha)} gene sets considered as 
    significantly enriched (p
    "
    ),
    htmltools::tags$sub("adj"),
    htmltools::em(
      glue(
        "
         < {enrich_alpha}) amongst the merged windows with detectable {target} 
        binding. Genes mapped to a merged window were compared to those not 
        mapped to any merged windows. Gene width was used to capture any 
        gene-level sampling bias.
        "
      )
    )
  )
)
tbl <- goseq_mapped %>%
  dplyr::filter(adj_p < enrich_alpha) %>% 
  left_join(
    dplyr::select(msigdb, gs_name, gene_name, gene_id, gs_url, gs_description),
    by = "gs_name"
  ) %>% 
  dplyr::filter(gene_id %in% unlist(merged_results$gene_id)) %>% 
  dplyr::select(-gene_id) %>% 
  chop(gene_name) %>% 
  mutate(
    gene_name = vapply(gene_name, paste, character(1), collapse = "; "),
    `%` = numDEInCat / numInCat
  ) %>% 
  dplyr::select(
    `Gene Set` = gs_name, Description = gs_description,
    `%`, numDEInCat, numInCat, 
    p = pval, adj_p, 
    `Mapped Genes` = gene_name
  ) %>% 
  reactable(
    filterable = TRUE,
    showPageSizeOptions = TRUE,
    columns = list(
      "Gene Set" = colDef(
        minWidth = 150,
        cell = function(value) htmltools::tags$a(
          href = gs_url[[value]], 
          target = "_blank", 
          str_replace_all(value, "_", " ")
        ),
        html = TRUE
      ),
      Description = colDef(
        minWidth = 150,
        cell = function(value) with_tooltip(value, width = 100)
      ),
      numDEInCat = colDef(name = "Total Mapped", maxWidth = 80),
      numInCat = colDef(
        name = "Gene Set Size",
        maxWidth = 80,
        cell = function(value) comma(value, 1)
      ),
      "%" = colDef(
        name = "% Mapped",
        cell = function(value) percent(value, 0.1),
        maxWidth = 80
      ),
      p = colDef(
        cell = function(value) {
          ifelse(
            value < 0.001, sprintf("%.2e", value), sprintf("%.3f", value)
          )
        },
        maxWidth = 80
      ),
      adj_p = colDef(
        name = "p<sub>adj</sub>", html = TRUE,
        cell = function(value) {
          ifelse(
            value < 0.001, sprintf("%.2e", value), sprintf("%.3f", value)
          )
        },
        maxWidth = 80
      ),
      "Mapped Genes" = colDef(
        cell = function(value) with_tooltip(value, width = 100),
        minWidth = 150
      )
    )
  )
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

All 11 gene sets considered as significantly enriched (p _adj < 0.05) amongst the merged windows with detectable AR binding. Genes mapped to a merged window were compared to those not mapped to any merged windows. Gene width was used to capture any gene-level sampling bias.

Network Plot

tg_mapped %>% 
  ggraph(layout = net_layout, weights = oc^2) +
  geom_edge_link(aes(width = oc^2, alpha = oc^2)) +
  geom_node_point(
    aes(fill = -log10(pval), size = numDEInCat),
    shape = 21
  ) +
  geom_node_text(
    aes(label = label),
    colour = "black", size = 3, 
    data = . %>%
      mutate(
        label = str_replace_all(label, "_", " ") %>% str_trunc(60) %>% str_wrap(width = 18)
      ),
    repel = TRUE, max.overlaps = max(10, round(length(tg_mapped) / 4, 0)),
    bg.color = "white", bg.r = 0.1, 
  ) +
  scale_x_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_y_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_fill_viridis_c(option = "inferno", begin = 0.25) +
  scale_size_continuous(range = c(1, 10)) +
  scale_edge_width(range = c(1, 6), limits = c(0, 1)) +
  scale_edge_alpha(range = c(0.1, 0.4), limits = c(0, 1))  +
  guides(edge_alpha= "none") +
  labs(size = "Mapped Genes", edge_width = expr(paste(OC^2))) +
  theme_void()

Network plot showing gene-sets enriched amongst the overall set of sites with a binding site for AR.

Windows With Differentially Bound AR

pwf_diff <- tibble(gene_id  = unique(gtf_gene$gene_id)) %>% 
  left_join(mapped_ids, by = "gene_id") %>% 
  dplyr::filter(!is.na(n_peaks)) %>% 
  mutate(diff = up|down) %>% 
  with(
    nullp(
      structure(diff, names = gene_id),
      genome = config$genome$build,
      bias.data = log1p(n_peaks),
      plot.fit = FALSE
    )
  )
goseq_diff <- tibble(gs_name = character(), pval = numeric(), adj_p = numeric())
if (sum(pwf_diff$DEgenes) > 0) {
  goseq_diff <- goseq(
    pwf_diff, gene2cat = gs_by_geneid, method = "Hypergeometric"
  ) %>% 
    dplyr::rename(gs_name = category, pval = over_represented_pvalue) %>% 
    dplyr::select(-starts_with("under")) %>% 
    as_tibble() %>% 
    arrange(pval) %>% 
    dplyr::filter(numDEInCat > 0) %>% 
    mutate(adj_p = p.adjust(pval, adj_method)) %>% 
    dplyr::filter(numDEInCat >= min_sig)
}
any_goseq_diff <- sum(goseq_diff$adj_p < enrich_alpha) > 0
tg_diff <- make_tbl_graph(
  goseq_diff, 
  gs = gs_by_gsid %>% 
    lapply(intersect, rownames(subset(pwf_diff, DEgenes))) %>%
    .[vapply(., length, integer(1)) > 0]
)
plot_network_diff <- length(tg_diff) >= min_network_size

No enrichment was found amongst genes mapped to sites showing any differential binding

Windows With Increased AR Binding

pwf_up <- tibble(gene_id  = unique(gtf_gene$gene_id)) %>% 
  left_join(mapped_ids, by = "gene_id") %>% 
  dplyr::filter(!is.na(n_peaks)) %>% 
  mutate(diff = up) %>% 
  with(
    nullp(
      structure(diff, names = gene_id),
      genome = config$genome$build,
      bias.data = log1p(n_peaks),
      plot.fit = FALSE
    )
  )
goseq_up <- tibble(gs_name = character(), pval = numeric(), adj_p = numeric())
if (sum(pwf_up$DEgenes) > 0) {
  goseq_up <- goseq(
    pwf_up, gene2cat = gs_by_geneid, method = "Hypergeometric"
  ) %>% 
    dplyr::rename(gs_name = category, pval = over_represented_pvalue) %>% 
    dplyr::select(-starts_with("under")) %>% 
    as_tibble() %>% 
    arrange(pval) %>% 
    dplyr::filter(numDEInCat > 0) %>% 
    mutate(adj_p = p.adjust(pval, adj_method)) %>% 
    dplyr::filter(numDEInCat >= min_sig)
}
any_goseq_up <- sum(goseq_up$adj_p < enrich_alpha) > 0
tg_up <- make_tbl_graph(
  goseq_up, 
  gs = gs_by_gsid %>% 
    lapply(intersect, rownames(subset(pwf_up, DEgenes))) %>%
    .[vapply(., length, integer(1)) > 0]
)
plot_network_up <- length(tg_up) >= min_network_size

No enrichment was found amongst genes mapped to sites showing increased AR binding

Windows With Decreased AR Binding

pwf_down <- tibble(gene_id  = unique(gtf_gene$gene_id)) %>% 
  left_join(mapped_ids, by = "gene_id") %>% 
  dplyr::filter(!is.na(n_peaks)) %>% 
  mutate(diff = down) %>% 
  with(
    nullp(
      structure(diff, names = gene_id),
      genome = config$genome$build,
      bias.data = log1p(n_peaks),
      plot.fit = FALSE
    )
  )
goseq_down <- tibble(gs_name = character(), pval = numeric(), adj_p = numeric())
if (sum(pwf_down$DEgenes) > 0) {
  goseq_down <- goseq(
    pwf_down, config$genome$build, gene2cat = gs_by_geneid, method = "Hypergeometric"
  ) %>% 
    dplyr::rename(gs_name = category, pval = over_represented_pvalue) %>% 
    dplyr::select(-starts_with("under")) %>% 
    as_tibble() %>% 
    arrange(pval) %>% 
    dplyr::filter(numDEInCat > 0) %>% 
    mutate(adj_p = p.adjust(pval, adj_method)) %>% 
    dplyr::filter(numDEInCat >= min_sig)
}
any_goseq_down <- sum(goseq_down$adj_p < enrich_alpha) > 0
tg_down <- make_tbl_graph(
  goseq_down, 
  gs = gs_by_gsid %>% 
    lapply(intersect, rownames(subset(pwf_down, DEgenes))) %>%
    .[vapply(., length, integer(1)) > 0]
)
plot_network_down <- length(tg_down) >= min_network_size

No enrichment was found amongst genes mapped to sites showing decreased AR binding

Comparison With RNA Seq

library(fgsea)
library(metap)
rna_status <- rnaseq %>% 
  mutate(
    de_status = case_when(
      !!sym(rna_fdr_col) < fdr_alpha & !!sym(rna_lfc_col) < 0 ~ "Down",
      !!sym(rna_fdr_col) < fdr_alpha & !!sym(rna_lfc_col) > 0 ~ "Up",
      !!sym(rna_fdr_col) >= fdr_alpha ~ "Unchanged"
    )
  ) %>% 
  dplyr::select(gene_id, gene_name, de_status) %>% 
  bind_rows(
    gtf_gene %>% 
      subset(!gene_id %in% rnaseq$gene_id) %>% 
      select(gene_id, gene_name) %>% 
      mcols() %>% 
      as_tibble() %>% 
      mutate(de_status = "Undetected")
  ) %>% 
  mutate(
    de_status = factor(
      de_status, levels = str_to_title(names(colours$direction))
    )
  ) %>% 
  arrange(gene_id)
rna_p_col <- colnames(rnaseq)[
  str_detect(str_to_lower(colnames(rnaseq)), "pval|p\\.val")
][1]
stopifnot(length(rna_p_col) == 1)
diff_windows <- merged_results %>% 
  splitAsList(.$status) %>% 
  lapply(select, gene_id) %>% 
  lapply(mcols) %>% 
  lapply(as_tibble) %>% 
  lapply(pull, "gene_id") %>% 
  lapply(unlist) %>% 
  lapply(unique)

Distribution Of Detected Genes

Detected Genes By AR Binding Region

The distribution of merged windows, where AR was considered as present, was firstly compared to genes within the RNA-Seq dataset. Genes were simply considered as detected if present in the RNA-Seq data, or undetected if not present. Gene-centric regions were as defined previously.

w <- 0.8
min_p <- 0.025
merged_results %>%
  mutate(
    any_detected = ifelse(
      vapply(gene_name, length, integer(1)) > 1,
      "Mapped to Detected Genes",
      "Mapped to Undetected Genes"
    )
  ) %>% 
  plotPie(fill = "region", x = "any_detected", label_size = 4, width = w) +
  geom_text(
    aes(xlab, ylab, label =lab),
    data = . %>% 
      mutate(
        xlab = x + w*(r + 0.1)*sin(label_radians),
        ylab = 1 + w*(r + 0.1)*cos(label_radians),
        lab = glue("{region}\n{percent(p, 0.1)}") %>% 
          str_wrap(10)
      ) %>% 
      dplyr::filter(p * N > min_p * max(N)),
    size = 3
  ) +
  geom_segment(
    aes(x_inner, y_inner, xend = x_outer, yend = y_outer),
     data = . %>% 
      mutate(
        x_inner = x + w*r*sin(label_radians),
        y_inner = 1 + w*r*cos(label_radians),
        x_outer = x + w*(r + 0.01)*sin(label_radians),
        y_outer = 1 + w*(r + 0.01)*cos(label_radians),
      ) %>% 
      dplyr::filter(p * N > min_p * max(N))
  ) +
  scale_fill_manual(
    values = colours$regions %>% 
      setNames(regions[names(.)]) 
  ) +
  labs(x = "", fill = "Region") +
  theme(panel.grid = element_blank())

Distribution of AR-bound windows by region, according to whether the window is mapped to a detected gene in the RNA-Seq dataset.

Detected Genes By Feature

merged_results %>% 
  mutate(
    any_detected = ifelse(
      vapply(gene_name, length, integer(1)) > 1,
      "Mapped to Detected Genes",
      "Mapped to Undetected Genes"
    )
  ) %>% 
  plotPie(fill = "feature", x = "any_detected", label_size = 4, width = w) +
  geom_text(
    aes(xlab, ylab, label =lab),
    data = . %>% 
      mutate(
        xlab = x + w*(r + 0.1)*sin(label_radians),
        ylab = 1 + w*(r + 0.1)*cos(label_radians),
        lab = glue("{feature}\n{percent(p, 0.1)}") %>% 
          str_wrap(10)
      ) %>% 
      dplyr::filter(p * N > min_p * max(N)),
    size = 3
  ) +
  geom_segment(
    aes(x_inner, y_inner, xend = x_outer, yend = y_outer),
     data = . %>% 
      mutate(
        x_inner = x + w*r*sin(label_radians),
        y_inner = 1 + w*r*cos(label_radians),
        x_outer = x + w*(r + 0.01)*sin(label_radians),
        y_outer = 1 + w*(r + 0.01)*cos(label_radians),
      ) %>% 
      dplyr::filter(p * N > min_p * max(N))
  ) +
  scale_fill_manual(
    values = colours$features %>% 
      setNames(str_sep_to_title(names(.)))
  ) +
  labs(x = "", fill = "External Feature") +
  theme(panel.grid = element_blank())

Distribution of AR-bound windows by externally-defined features from enhancer_atlas_2.0_zr75.gtf.gz, according to whether the window is mapped to a detected gene in the RNA-Seq dataset.

Relationship With Differentially Expressed Genes

rna_dir_ranks <- rnaseq %>% 
  mutate(
    ranking_stat = -sign(!!sym(rna_lfc_col))*log10(!!sym(rna_p_col)) %>% 
      setNames(gene_id)
  ) %>% 
  arrange(ranking_stat) %>% 
  dplyr::filter(!!sym(rna_p_col) < 1) %>% 
  pull("ranking_stat") 
rna_sig_ranks <- rna_dir_ranks %>% 
  abs() %>% 
  sort()
dir_gs <- merged_results %>% 
  select(status, starts_with("gene")) %>% 
  as_tibble() %>% 
  mutate(group = glue("{target} {status}")) %>% 
  split(.$group)%>% 
  lapply(pull, "gene_id") %>% 
  lapply(unlist) %>% 
  lapply(unique) %>% 
  .[vapply(., length, integer(1)) >= min_gs_size]
reg_dir_gs <- merged_results %>% 
  select(
    overlaps_ref, status, any_of(c("region", "feature")), starts_with("gene")
  ) %>% 
  as_tibble() %>% 
  mutate(group = glue("{region} / {target} {status}")) %>% 
  split(.$group)%>% 
  lapply(pull, "gene_id") %>% 
  lapply(unlist) %>% 
  lapply(unique) %>% 
  .[vapply(., length, integer(1)) >= min_gs_size]
if (has_features) {
  feat_dir_gs <- merged_results %>% 
  select(
    overlaps_ref, status, any_of(c("feature")), starts_with("gene")
  ) %>% 
  as_tibble() %>% 
  mutate(group = glue("{feature} / {target} {status}")) %>% 
  split(.$group)%>% 
  lapply(pull, "gene_id") %>% 
  lapply(unlist) %>% 
  lapply(unique) %>% 
  .[vapply(., length, integer(1)) >= min_gs_size]
}

DE Genes Associated with Changed AR Binding

de_genes_db_regions <- tibble()
any_diff_sites <- sum(mcols(merged_results)[[fdr_column]] < fdr_alpha) > 0
any_de <- any(rnaseq$DE)
para <- ""
if (any_de & any_diff_sites) {
  ft_data <- rnaseq %>%
    dplyr::filter(gene_id %in% unlist(diff_windows)) %>% 
    mutate(
      window = case_when(
        gene_id %in% unlist(diff_windows[c("Up", "Down")]) ~ "Diff",
        TRUE ~ "Unchanged"
      ) %>%
        fct_infreq()
    ) %>% 
    group_by(DE, window) %>% 
    tally() %>% 
    pivot_wider(
      names_from = "DE", values_from = "n", 
      names_prefix = "DE_", values_fill = 0
    ) %>% 
    as.data.frame() %>% 
    column_to_rownames("window") %>% 
    as.matrix()
  ft <- fisher.test(ft_data)
  para <- glue(
    "
    Using the {comma(length(unique(unlist(diff_windows))))} genes mapped to a 
    {target} binding site, genes were tested for any association between sites 
    showing evidence of diferential binding and differential expression.
    {comma(length(unique(unlist(diff_windows[c('Up', 'Down')]))))} genes were 
    mapped to at least one site showing differential {target} binding with 
    {comma(ft_data['Diff', 'DE_TRUE'])} 
    ({percent(ft_data['Diff', 'DE_TRUE'] / sum(ft_data['Diff',]))}) of these
    being considered differentially expressed. This compares to 
    {percent(ft_data['Unchanged', 'DE_TRUE'] / sum(ft_data['Unchanged', ]), 0.1)}
    of genes where no differential {target} binding was detected. Fisher's 
    Exact test for any association gave this a p-value of 
    {ifelse(ft$p.value < 0.01, sprintf('%.2e', ft$p.value), round(ft$p.value, 3))}.
    "
  )
}

Using the 6,034 genes mapped to a AR binding site, genes were tested for any association between sites showing evidence of diferential binding and differential expression. 5,001 genes were mapped to at least one site showing differential AR binding with 97 (2%) of these being considered differentially expressed. This compares to 0.5% of genes where no differential AR binding was detected. Fisher’s Exact test for any association gave this a p-value of 2.92e-04.

cmn_genes <- intersect(
  rnaseq %>% dplyr::filter(!!sym(rna_fdr_col) < fdr_alpha) %>% pull("gene_id"),
  merged_results %>% 
    filter(!!sym(fdr_column) < fdr_alpha) %>% 
    as_tibble() %>% 
    pull("gene_id") %>% 
    unlist()
)
de_genes_db_regions <- merged_results %>% 
  filter(!!sym(fdr_column) < fdr_alpha) %>% 
  filter(vapply(gene_id, function(x) any(x %in% cmn_genes), logical(1))) %>% 
  as_tibble() %>% 
  dplyr::select(
    range, ChIP_logCPM = AveExpr, ChIP_logFC = logFC, ChIP_FDR := !!sym(fdr_column), overlaps_ref,
    status, any_of(c("region", "feature")), gene_id
  ) %>% 
  unnest(gene_id) %>% 
  dplyr::filter(gene_id %in% cmn_genes) %>% 
  left_join(
    rnaseq %>% 
      dplyr::filter(gene_id %in% cmn_genes) %>% 
      dplyr::select(
        all_of(c("gene_id", "gene_name")), 
        RNA_logFC := !!sym(rna_lfc_col), RNA_FDR := !!sym(rna_fdr_col)
      ),
    by = "gene_id"
  ) %>% 
  arrange(RNA_FDR) %>% 
  # dplyr::slice(1:20) %>%
  mutate(
    dist2Gene = mapply(
      function(x, y) {
        distance(GRanges(x), subset(gtf_gene, gene_id == y)) 
      },
      x = range,
      y = gene_id
    )
  ) %>% 
  mutate(
    region = case_when(
      dist2Gene > extra_params$gene_regions$upstream & str_detect(region, "Promoter|Exon|Intron") ~ paste(
        str_extract(region, ".*(Promoter|Exon|Intron)"), "(Other Gene)"),
      TRUE ~ as.character(region)
    ),
    region = factor(
      region, 
      levels = c(
        levels(merged_results$region), 
        paste(
          str_extract(levels(merged_results$region), ".*(Promoter|Exon|Intron)"), 
          "(Other Gene)"
        )
      ) %>% 
        unique()
    )
  ) %>% 
  dplyr::select(
    gene_name, gene_id, starts_with("RNA"), range, dist2Gene, 
    starts_with("ChIP"), everything()
  ) %>% 
  droplevels()

range_js <- JS(
  "function(values) {
    var min_val = Math.round(100 * Math.min(...values)) / 100
    var max_val = Math.round(100 * Math.max(...values)) / 100
    if (min_val == max_val) {
      var ret_val = min_val.toString()
    } else {
      var ret_val = '[' + min_val.toString() + ', ' + max_val.toString() + ']'
    }
    return ret_val
  }"
)
max_signal <- max(merged_results$AveExpr)
feat_def <- NULL
if (has_features) feat_def = list(
  feature = colDef (
    name = "Feature",
    aggregate = "unique"
  )
)
cp <-  htmltools::em(
  glue(
    "
    Differentially Expressed (DE) genes were checked for {target} binding
    regions which showed evidence of changed binding. 
    {comma(length(unique(de_genes_db_regions$range)), 1)} unique regions were 
    mapped to {comma(length(unique(de_genes_db_regions$gene_id)), 1)} DE genes. 
    Data are presented in a gene-centric manner and ranges may map to multiple
    genes. The full set of ranges considered to be showing evidence of altered 
    {target} binding are shown for each gene. The summarised range containing 
    all mapped ranges is given in the collapsed row, along with the range of 
    distances to gene and the range of signal levels. In the collapsed row, the 
    value for logFC represents the average across all mapped regions, with the 
    maximum FDR also shown. If {target} binding occurs within another gene this 
    is indicated in the 'Region' column.
    All sites and genes were exported as {basename(all_out$de_genes)}.
    "
  )
)
fs <- 12
tbl <- de_genes_db_regions %>% 
  reactable(
    groupBy = "gene_name",
    filterable = TRUE,
    borderless = TRUE,
    showPageSizeOptions = TRUE,
    columns = c(
      list(
        gene_name = colDef(
          name = "Gene", maxWidth = fs*11,
          style = list(
            borderRight = "1px solid rgba(0, 0, 0, 0.1)",
            fontSize = fs
          )
        ),
        gene_id = colDef(
          name = "ID", aggregate = "unique", cell = function(value) "",
          show = FALSE
        ),
        RNA_logFC = colDef(
          name = "logFC",
          aggregate = "mean", 
          style = JS(
            glue(
            "function(rowInfo) {
              var value = rowInfo.row['RNA_logFC']
              if (value < 0) {
                var color = '{{direction_colours[['Down']]}}'
              } else if (value > 0) {
                var color = '{{direction_colours[['Up']]}}'
              } else {
                var color = '#000000'
              }
              return { color: color, fontSize: {{fs}} }
            }",
            .open = "{{", .close = "}}"
            )
          ),
          format = colFormat(digits= 2),
          maxWidth = fs*6,
          cell = function(value) ""
        ),
        RNA_FDR = colDef(
          name = "FDR",
          aggregate = "min",
          format = colFormat(digits = 4),
          maxWidth = fs*6,
          cell = function(value) "",
          style = list(
            borderRight = "1px solid rgba(0, 0, 0, 0.1)", 
            fontSize = fs
          )
        ),
        range = colDef(
          name = "Range",
          minWidth = 160,
          aggregate = JS(
            "function(values){
              var chrom = [];
              var rng = [];
              var start = [];
              var end = [];
              for (i = 0; i < values.length; i++) {
                chrom[i] = values[i].split(':')[0];
                rng[i] = values[i].split(':')[1];
                start[i] = rng[i].split('-')[0];
                end[i] = rng[i].split('-')[1];
              }
              
              min_start = Math.min(...start);
              max_end = Math.max(...end);
              var ret_val = [...new Set(chrom)] + ':' + min_start.toString() + '-' + max_end.toString();
              
              return ret_val
              
            }"
          )
        ),
        dist2Gene = colDef(
          name = "Distance to Gene",
          aggregate = range_js,
          format = colFormat(digits = 0, separators = TRUE),
          maxWidth = fs*8,
          style = list(
            borderRight = "1px solid rgba(0, 0, 0, 0.1)",
            fontSize = fs
          ),
        ),
        ChIP_logCPM = colDef(
          name = "logCPM",
          maxWidth = fs*6,
          aggregate = range_js,
          format = colFormat(digits = 2),
          style = function(value) bar_style(
            width = value / max_signal, fontSize = fs
          )
        ),
        ChIP_logFC = colDef(
          name = "logFC",
          aggregate = "mean",
          maxWidth = fs*5,
          format = colFormat(digits = 2),
          cell = function(value) sprintf("%.2f", value),
          style =  JS(
            glue(
            "function(rowInfo) {
              var value = rowInfo.row['ChIP_logFC']
              if (value < 0) {
                var color = '{{direction_colours[['Down']]}}'
              } else if (value > 0) {
                var color = '{{direction_colours[['Up']]}}'
              } else {
                var color = '#000000'
              }
              return { color: color, fontSize: {{fs}} }
            }",
            .open = "{{", .close = "}}"
            )
          )
        ),
        ChIP_FDR = colDef(
          name = "FDR",
          aggregate = "max",
          maxWidth = 10 + fs*5,
          format = colFormat(prefix = "< ", digits = 4),
          cell = function(value) ifelse(
            value < 0.001,
            sprintf("%.2e", value),
            sprintf("%.3f", value)
          )
        ),
        overlaps_ref = colDef(
          name = "Macs Peak",
          show = FALSE,
          aggregate = "frequency",
          cell = function(value) ifelse(value, "\u2714 Yes", "\u2716 No"),
          style = function(value) {
            color <- ifelse(value, "#008000", "#e00000")
            list(color = color, fontSize = fs)
          },
          maxWidth = fs*5
        ),
        status = colDef(
          name = "Status",
          aggregate = "frequency",
          maxWidth = fs*6,
          cell = function(value) {
            html_symbol <- ""
            if (str_detect(value, "Up")) html_symbol <- "\u21E7"
            if (str_detect(value, "Down")) html_symbol <- "\u21E9"
            paste(html_symbol, value)
          },
          style = JS(
            glue(
            "function(rowInfo) {
              var value = rowInfo.row['ChIP_logFC']
              if (value < 0) {
                var color = '{{direction_colours[['Down']]}}'
              } else if (value > 0) {
                var color = '{{direction_colours[['Up']]}}'
              } else {
                var color = '#000000'
              }
              return { color: color, fontSize: {{fs}} }
            }",
            .open = "{{", .close = "}}"
            )
          )
        ),
        region = colDef(
          name = "Binding Region",
          aggregate = "unique"
        )
      ),
      feat_def
    ),
    columnGroups = list(
      colGroup(name = "RNA-Seq", columns = c("RNA_logFC", "RNA_FDR")),
      colGroup(
        name = paste(target, "ChIP-Seq"), 
        columns = c("ChIP_logCPM", "ChIP_logFC", "ChIP_FDR", "status")
      )
    ),
    theme = reactableTheme(style = list(fontSize = fs))
  )
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

Differentially Expressed (DE) genes were checked for AR binding regions which showed evidence of changed binding. 298 unique regions were mapped to 97 DE genes. Data are presented in a gene-centric manner and ranges may map to multiple genes. The full set of ranges considered to be showing evidence of altered AR binding are shown for each gene. The summarised range containing all mapped ranges is given in the collapsed row, along with the range of distances to gene and the range of signal levels. In the collapsed row, the value for logFC represents the average across all mapped regions, with the maximum FDR also shown. If AR binding occurs within another gene this is indicated in the 'Region' column. All sites and genes were exported as AR_E2_E2DHT-DE_genes.csv.

Volcano Plots By AR Binding

diff_tbl <- diff_windows %>% 
  lapply(function(x) tibble(gene_id = x)) %>% 
  bind_rows(.id = "status")
rnaseq %>% 
  left_join(diff_tbl) %>% 
  mutate(
    status = str_replace_na(status, "Undetected"),
    status = glue("{target} {status}") %>%
      factor(levels = paste(target, c("Up", "Down", "Ambiguous", "Unchanged", "Undetected")))
  ) %>% 
  left_join(rna_status) %>% 
  droplevels() %>% 
  ggplot(
    aes(!!sym(rna_lfc_col), -log10(!!sym(rna_p_col)), colour = de_status)
  ) +
  geom_point() +
  geom_text_repel(
    aes(label = gene_name),
    data = . %>% 
      dplyr::arrange(!!sym(rna_p_col)) %>% 
      dplyr::filter(!!sym(rna_lfc_col) > 0, DE) %>% 
      split(.$status) %>% 
      lapply(dplyr::slice, 1:2) %>% 
      bind_rows(),
    show.legend = FALSE
  ) +
  geom_text_repel(
    aes(label = gene_name),
    data = . %>% 
      dplyr::arrange(!!sym(rna_p_col)) %>% 
      dplyr::filter(!!sym(rna_lfc_col) < 0, DE) %>% 
      split(.$status) %>% 
      lapply(dplyr::slice, 1:2) %>% 
      bind_rows(),
    show.legend = FALSE
  ) +
  geom_text(
    aes(label = lab),
    data = . %>% 
      group_by(status) %>% 
      summarise(n = dplyr::n()) %>% 
      mutate(lab = glue("n = {comma(n)}")),
    x = min(rnaseq[[rna_lfc_col]]) + 0.9 * diff(range(rnaseq[[rna_lfc_col]])),
    y = 1,
    colour = "black",
    inherit.aes = FALSE,
    show.legend = FALSE
  ) +
  facet_wrap(~status, ncol = 2) +
  scale_colour_manual(
    values = direction_colours
  ) +
  labs(colour = "DE Status")

Volcano plot showing gene expression patterns separated by AR status. The two most highly ranked genes for up and down regulation are labelled in each panel. Given that some genes may be mapped to multiple AR binding events which include different binding patterns, genes may appear in multiple panels.

P-Values By AR Status

p <- rnaseq %>% 
  mutate(
    mapped = case_when(
      gene_id %in% unlist(diff_windows[names(diff_windows) %in% c("Up", "Down")]) ~ paste(target, "Differentially Bound"),
      gene_id %in% unlist(diff_windows[names(diff_windows) %in% c("Unchanged")]) ~ paste(target, "Unchanged"),
      TRUE ~ paste("No", target, "Detected")
    )
  ) %>% 
  ggplot(
    aes(!!sym(rna_p_col), stat(density))
  ) +
  geom_histogram(
    colour = "black", fill = "grey", bins = 100
  ) +
  facet_wrap(~mapped, nrow = 1) +
  scale_y_continuous(expand = expansion(c(0, 0.05))) +
  labs(y = "Density", x = "P (Differential Expression)")
y <- 0.95 * max(layer_scales(p)$y$range$range)
p  +
  geom_text(
    aes(0.8, y, label = lab),
    data = . %>% 
      group_by(mapped) %>% 
      tally() %>% 
      mutate(lab = glue("N = {comma(n, 1)}"))
  )

P-Values for differential expression partitioned by AR ChIP peak status. Genes were considered as being mapped to a differentially bound peak if one or more peaks was considered as differentially bound. No IHW procedure was undertaken using this data.

ChIP logFC Vs RNA logFC

de_genes_db_regions %>% 
  dplyr::filter(ChIP_FDR < fdr_alpha) %>% 
  group_by(gene_name) %>%
  dplyr::filter(ChIP_FDR == min(ChIP_FDR)) %>%
  mutate(DE = ifelse(RNA_logFC > 0, "Up", "Down")) %>% 
  ungroup() %>% 
  ggplot(
    aes(ChIP_logFC, RNA_logFC, colour = DE)
  ) +
  geom_point() +
  geom_label_repel(
    aes(label = lab),
    data = . %>% 
      mutate(grp = paste(DE, status)) %>% 
      # arrange(desc(abs(ChIP_logFC))) %>% 
      arrange(ChIP_FDR, RNA_FDR) %>% 
      split(.$grp) %>% 
      lapply(dplyr::slice, 1) %>% 
      bind_rows() %>% 
      mutate(
        range = GRanges(range) %>% 
          resize(width = 1, fix = "center") %>% 
          as.character(),
        lab = paste(gene_name, "\n", range)
      ),
    show.legend = FALSE,
    alpha = 0.7
  ) +
  geom_text(
    aes(x, y, label = lab),
    data = . %>% 
      group_by(DE, status) %>% 
      summarise(
        n = dplyr::n(),
        lab = glue("n = {comma(n)}"),
        x = max(abs(ChIP_logFC)) * sign(min(ChIP_logFC)) * 1.05,
        y = max(abs(RNA_logFC)) * sign(min(RNA_logFC)) * 1.05,
        .groups = "drop"
      ) %>% 
      group_by(DE) %>% 
      mutate(y = max(abs(y)) * sign(y)) %>% 
      ungroup() %>% 
      group_by(status) %>% 
      mutate(x = max(abs(x)) * sign(x)),
    colour = "black"
  ) +
  geom_hline(yintercept = 0, colour = "grey20") +
  geom_vline(xintercept = 0, colour = "grey20") +
  scale_colour_manual(values = direction_colours[c("Up", "Down")]) +
  labs(
    x = glue("logFC ({target} ChIP-Seq)"),
    y = "logFC (RNA-Seq)",
    colour = "Differential\nExpression"
  )

logFC values for differentially bound ChIP-Seq peaks mapped to DE genes. The most highly ranked peak for each quandrant is labelled at showing both the gene and centre of the binding region. Whilst binding sites may be mapped to multiple genes, only the most highly ranked binding site is shown mapped to the mosthighly ranked DE genes. Points are coloured by differential expression status.

Binding Pattern GSEA

GSEA By Direction of Regulation

GSEA analysis was first performed by taking the genes mapped to various sets of peaks, and checking for enrichment amongst the RNA-Seq results, ranking genes from up- to down-regulated, by significance of the logFC estimates.

All Windows

gsea_dir <- fgsea(dir_gs, rna_dir_ranks)
gsea_dir_sig <- gsea_dir %>% 
  arrange(pval) %>% 
  mutate(padj = p.adjust(pval, adj_method)) %>% 
  dplyr::filter(padj < enrich_alpha, size >= min_sig) %>% 
  as_tibble() 
p <- gsea_dir_sig %>% 
  dplyr::slice(1:9) %>% 
  pull("pathway") %>% 
  lapply(
    function(x) {
      plotEnrichment(dir_gs[[x]], rna_dir_ranks) +
        ggtitle(x) +
        theme(
          plot.title = element_text(hjust = 0.5, size = 10)
        )
    }
  )

cp <-  htmltools::em(
  glue(
    "Merged windows were mapped to genes and their position amongst the ",
    "RNA-Seq results was assessed. Windows were classified based on  ",
    "the direction of {target} binding with ",
    "{treat_levels[[2]]} treatment. {nrow(gsea_dir_sig)} sets of windows were ",
    "associated with changes in gene expression, using the sign of ", 
    "fold-change and ranking statistic to initially rank the ", 
    "{comma(nrow(rnaseq), 1)} genes considered as detected."
  )
)
tbl <- gsea_dir_sig %>%
  mutate(
    `Edge Size` = vapply(leadingEdge, length, integer(1)),
    leadingEdge = lapply(leadingEdge, function(x) id2gene[x]) %>% 
      vapply(paste, character(1), collapse = "; "),
    Direction = ifelse(NES > 0, "\u21E7 Up-regulated", "\u21E9 Down-regulated"),
    "{target}" := str_extract(pathway, "(Up|Down|Unchanged)$"),
    pathway = str_remove_all(pathway, " / [^/]+$")
  ) %>% 
  dplyr::select(
    !!sym(target), Windows = size, Direction, 
    p = pval, padj, `Edge Size`, `Leading Edge` = leadingEdge
  ) %>% 
  reactable(
    filterable = TRUE,
    columns = list2(
      "{target}" := colDef(
        maxWidth = 90,
        cell = function(value) {
          html_symbol <- ""
          if (str_detect(value, "Up")) html_symbol <- "\u21E7"
          if (str_detect(value, "Down")) html_symbol <- "\u21E9"
          paste(html_symbol, value)
        },
        style = function(value) {
          colour <- case_when(
            str_detect(value, "Up") ~ colours$direction[["up"]],
            str_detect(value, "Down") ~ colours$direction[["down"]],
            TRUE ~ colours$direction[["unchanged"]]
          )
          list(color = colour)
        }
      ),
      Windows = colDef(maxWidth = 80, format = colFormat(separators = TRUE)),
      Direction = colDef(
        name = "Gene Direction",
        maxWidth = 150,
        style = function(value) {
          colour <- ifelse(
            str_detect(value, "Up"), 
            colours$direction[["up"]], 
            colours$direction[["down"]]
          )
          list(color = colour)
        },
      ),
      p = colDef(
        cell = function(value) sprintf("%.2e", value), maxWidth = 80
      ),
      padj = colDef(
        name = glue("p<sub>{adj_method}</sub>"), html = TRUE,
        cell = function(value) {
          ifelse(
            value < 0.001, sprintf("%.2e", value), sprintf("%.3f", value)
          )
        }
      ),
      "Edge Size" = colDef(maxWidth = 80),
     "Leading Edge" = colDef(
       minWidth = 150,
       cell = function(value) with_tooltip(value, width = 50)
     ) 
    )
  )
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

Merged windows were mapped to genes and their position amongst the RNA-Seq results was assessed. Windows were classified based on the direction of AR binding with E2DHT treatment. 2 sets of windows were associated with changes in gene expression, using the sign of fold-change and ranking statistic to initially rank the 13,043 genes considered as detected.

wrap_plots(p)

Barcode plots for the top 2 sets of windows associated with directional changes in gene expression.

Windows By Region

gsea_reg_dir <- fgsea(reg_dir_gs, rna_dir_ranks)
gsea_reg_dir_sig <- gsea_reg_dir %>% 
  arrange(pval) %>% 
  mutate(padj = p.adjust(pval, adj_method)) %>% 
  dplyr::filter(padj < enrich_alpha, size >= min_sig) %>% 
  as_tibble() 
p <- gsea_reg_dir_sig %>% 
  dplyr::slice(1:9) %>% 
  pull("pathway") %>% 
  lapply(
    function(x) {
      plotEnrichment(reg_dir_gs[[x]], rna_dir_ranks) +
        ggtitle(x) +
        theme(
          plot.title = element_text(hjust = 0.5, size = 10)
        )
    }
  )

cp <-  htmltools::em(
  glue(
    "Merged windows were mapped to genes and their position amongst the ",
    "RNA-Seq results was assessed. Windows were classified based on which ",
    "region was the best overlap, and the direction of {target} binding with ",
    "{treat_levels[[2]]} treatment. {nrow(gsea_reg_dir_sig)} sets of windows were ",
    "associated with changes in gene expression, using the sign of ", 
    "fold-change and ranking statistic to initially rank the ", 
    "{comma(nrow(rnaseq), 1)} genes considered as detected."
  )
)
tbl <- gsea_reg_dir_sig %>%
  mutate(
    `Edge Size` = vapply(leadingEdge, length, integer(1)),
    leadingEdge = lapply(leadingEdge, function(x) id2gene[x]) %>% 
      vapply(paste, character(1), collapse = "; "),
    Direction = ifelse(NES > 0, "\u21E7 Up-regulated", "\u21E9 Down-regulated"),
    "{target}" := str_extract(pathway, "(Up|Down|Unchanged)$"),
    pathway = str_remove_all(pathway, " / [^/]+$")
  ) %>% 
  dplyr::select(
    pathway, !!sym(target), Windows = size, Direction, 
    p = pval, padj, `Edge Size`, `Leading Edge` = leadingEdge
  ) %>% 
  reactable(
    filterable = TRUE,
    columns = list2(
      pathway = colDef(
        minWidth = 100, name = "Region"
      ),
      "{target}" := colDef(
        maxWidth = 90,
        cell = function(value) {
          html_symbol <- ""
          if (str_detect(value, "Up")) html_symbol <- "\u21E7"
          if (str_detect(value, "Down")) html_symbol <- "\u21E9"
          paste(html_symbol, value)
        },
        style = function(value) {
          colour <- case_when(
            str_detect(value, "Up") ~ colours$direction[["up"]],
            str_detect(value, "Down") ~ colours$direction[["down"]],
            TRUE ~ colours$direction[["unchanged"]]
          )
          list(color = colour)
        }
      ),
      Windows = colDef(maxWidth = 80, format = colFormat(separators = TRUE)),
      Direction = colDef(
        name = "Gene Direction",
        maxWidth = 150,
        style = function(value) {
          colour <- ifelse(
            str_detect(value, "Up"), 
            colours$direction[["up"]], 
            colours$direction[["down"]]
          )
          list(color = colour)
        },
      ),
      p = colDef(
        cell = function(value) sprintf("%.2e", value), maxWidth = 80
      ),
      padj = colDef(
        name = glue("p<sub>{adj_method}</sub>"), html = TRUE,
        cell = function(value) {
          ifelse(
            value < 0.001, sprintf("%.2e", value), sprintf("%.3f", value)
          )
        }
      ),
      "Edge Size" = colDef(maxWidth = 80),
     "Leading Edge" = colDef(
       minWidth = 150,
       cell = function(value) with_tooltip(value, width = 50)
     ) 
    )
  )
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

Merged windows were mapped to genes and their position amongst the RNA-Seq results was assessed. Windows were classified based on which region was the best overlap, and the direction of AR binding with E2DHT treatment. 7 sets of windows were associated with changes in gene expression, using the sign of fold-change and ranking statistic to initially rank the 13,043 genes considered as detected.

wrap_plots(p)

Barcode plots for the top 7 sets of windows associated with directional changes in gene expression.

Windows By Feature

gsea_feat_dir <- fgsea(feat_dir_gs, rna_dir_ranks)
gsea_feat_dir_sig <- gsea_feat_dir %>% 
  arrange(pval) %>% 
  dplyr::filter(padj < enrich_alpha, size >= min_sig) %>% 
  as_tibble() 
p <- gsea_feat_dir_sig %>% 
  dplyr::slice(1:9) %>% 
  pull("pathway") %>% 
  lapply(
    function(x) {
      plotEnrichment(feat_dir_gs[[x]], rna_dir_ranks) +
        ggtitle(x) +
        theme(
          plot.title = element_text(hjust = 0.5, size = 10)
        )
    }
  )

cp <-  htmltools::em(
  glue(
    "Merged windows were mapped to genes and their position amongst the ",
    "RNA-Seq results was assessed. Windows were classified based on which ",
    "feature was the best overlap, and the direction of {target} binding with ",
    "{treat_levels[[2]]} treatment. {nrow(gsea_feat_dir_sig)} sets of windows were ",
    "associated with changes in gene expression, using the sign of ", 
    "fold-change and ranking statistic to initially rank the ", 
    "{comma(nrow(rnaseq), 1)} genes considered as detected."
  )
)
tbl <- gsea_feat_dir_sig %>%
  mutate(
    `Edge Size` = vapply(leadingEdge, length, integer(1)),
    leadingEdge = lapply(leadingEdge, function(x) id2gene[x]) %>% 
      vapply(paste, character(1), collapse = "; "),
    Direction = ifelse(NES > 0, "\u21E7 Up-regulated", "\u21E9 Down-regulated"),
    "{target}" := str_extract(pathway, "(Up|Down|Unchanged)$"),
    pathway = str_remove_all(pathway, " / [^/]+$")
  ) %>% 
  dplyr::select(
    pathway, !!sym(target), Windows = size, Direction, 
    p = pval, FDR = padj, `Edge Size`, `Leading Edge` = leadingEdge
  ) %>% 
  reactable(
    filterable = TRUE,
    columns = list2(
      pathway = colDef(
        minWidth = 100, name = "Region"
      ),
      "{target}" := colDef(
        maxWidth = 120,
        cell = function(value) {
          html_symbol <- ""
          if (str_detect(value, "Up")) html_symbol <- "\u21E7"
          if (str_detect(value, "Down")) html_symbol <- "\u21E9"
          paste(html_symbol, value)
        },
        style = function(value) {
          colour <- case_when(
            str_detect(value, "Up") ~ colours$direction[["up"]],
            str_detect(value, "Down") ~ colours$direction[["down"]],
            TRUE ~ colours$direction[["unchanged"]]
          )
          list(color = colour)
        }
      ),
      Windows = colDef(maxWidth = 80, format = colFormat(separators = TRUE)),
      Direction = colDef(
        name = "Gene Direction",
        maxWidth = 120,
        style = function(value) {
          colour <- ifelse(
            str_detect(value, "Up"), 
            colours$direction[["up"]], 
            colours$direction[["down"]]
          )
          list(color = colour)
        },
      ),
      p = colDef(
        cell = function(value) sprintf("%.2e", value), maxWidth = 80
      ),
      FDR = colDef(
        cell = function(value) sprintf("%.2e", value), maxWidth = 80
      ),
      "Edge Size" = colDef(maxWidth = 80),
     "Leading Edge" = colDef(
       minWidth = 150,
       cell = function(value) with_tooltip(value, width = 50)
     ) 
    )
  )
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

Merged windows were mapped to genes and their position amongst the RNA-Seq results was assessed. Windows were classified based on which feature was the best overlap, and the direction of AR binding with E2DHT treatment. 2 sets of windows were associated with changes in gene expression, using the sign of fold-change and ranking statistic to initially rank the 13,043 genes considered as detected.

wrap_plots(p)

Barcode plots for the top 2 sets of windows associated with directional changes in gene expression.

GSEA By Significance Only

GSEA analysis was then performed by taking the genes mapped to various sets of peaks, and checking for enrichment amongst the RNA-Seq results, ranking genes only from least to most significant, which effectively treats up- and down-regulation as indistinguishable.

All Windows

gsea_nondir <- fgsea(dir_gs, rna_sig_ranks, scoreType = "pos")
gsea_nondir_sig <- gsea_nondir %>% 
  arrange(pval) %>% 
  dplyr::filter(padj < enrich_alpha, size >= min_sig) %>% 
  as_tibble() 
p <- gsea_nondir_sig %>% 
  dplyr::slice(1:9) %>% 
  pull("pathway") %>% 
  lapply(
    function(x) {
      plotEnrichment(dir_gs[[x]], rna_sig_ranks) +
        ggtitle(x) +
        theme(plot.title = element_text(hjust = 0.5, size = 10))
    }
  )

cp <-  htmltools::em(
  glue(
    "Merged windows were mapped to genes and their position amongst the ",
    "RNA-Seq results was assessed. Windows were classified based on  ",
    "the direction of {target} binding with ",
    "{treat_levels[[2]]} treatment. {nrow(gsea_nondir_sig)} sets of windows were ",
    "associated with changes in gene expression, using only the p-value to ", 
    "rank the {comma(nrow(rnaseq), 1)} genes considered as detected."
  )
)
tbl <- gsea_nondir_sig %>%
  mutate(
    `Edge Size` = vapply(leadingEdge, length, integer(1)),
    leadingEdge = lapply(leadingEdge, function(x) id2gene[x]) %>% 
      vapply(paste, character(1), collapse = "; "),
    "{target}" := str_extract(pathway, "(Up|Down|Unchanged)$"),
    pathway = str_remove_all(pathway, " / [^/]+$")
  ) %>% 
  dplyr::select(
    !!sym(target), Windows = size, p = pval, FDR = padj, 
    `Edge Size`, `Leading Edge` = leadingEdge
  ) %>% 
  reactable(
    filterable = TRUE,
    columns = list2(
      "{target}" := colDef(
        maxWidth = 150,
        cell = function(value) {
          html_symbol <- ""
          if (str_detect(value, "Up")) html_symbol <- "\u21E7"
          if (str_detect(value, "Down")) html_symbol <- "\u21E9"
          paste(html_symbol, value)
        },
        style = function(value) {
           colour <- case_when(
            str_detect(value, "Up") ~ colours$direction[["up"]],
            str_detect(value, "Down") ~ colours$direction[["down"]],
            TRUE ~ colours$direction[["unchanged"]]
          )
          list(color = colour)
        }
      ),
      Windows = colDef(maxWidth = 80, format = colFormat(separators = TRUE)),
      p = colDef(
        cell = function(value) sprintf("%.2e", value), maxWidth = 80
      ),
      FDR = colDef(
        cell = function(value) sprintf("%.2e", value), maxWidth = 80
      ),
      "Edge Size" = colDef(maxWidth = 80),
     "Leading Edge" = colDef(
       minWidth = 150,
       cell = function(value) with_tooltip(value, width = 50)
     ) 
    )
  )
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

Merged windows were mapped to genes and their position amongst the RNA-Seq results was assessed. Windows were classified based on the direction of AR binding with E2DHT treatment. 3 sets of windows were associated with changes in gene expression, using only the p-value to rank the 13,043 genes considered as detected.

wrap_plots(p)

Barcode plots for the top 3 sets of windows associated with non-directional changes in gene expression.

Windows By Region

gsea_reg_nondir <- fgsea(reg_dir_gs, rna_sig_ranks, scoreType = "pos")
gsea_reg_nondir_sig <- gsea_reg_nondir %>% 
  arrange(pval) %>% 
  dplyr::filter(padj < enrich_alpha, size >= min_sig) %>% 
  as_tibble() 
p <- gsea_reg_nondir_sig %>% 
  dplyr::slice(1:9) %>% 
  pull("pathway") %>% 
  lapply(
    function(x) {
      plotEnrichment(reg_dir_gs[[x]], rna_sig_ranks) +
        ggtitle(x) +
        theme(plot.title = element_text(hjust = 0.5, size = 10))
    }
  )

cp <-  htmltools::em(
  glue(
    "Merged windows were mapped to genes and their position amongst the ",
    "RNA-Seq results was assessed. Windows were classified based on which ",
    "region was the best overlap, and the direction of {target} binding with ",
    "{treat_levels[[2]]} treatment. {nrow(gsea_reg_nondir_sig)} sets of windows were ",
    "associated with changes in gene expression, using only the p-value to ", 
    "rank the {comma(nrow(rnaseq), 1)} genes considered as detected."
  )
)
tbl <- gsea_reg_nondir_sig %>%
  mutate(
    `Edge Size` = vapply(leadingEdge, length, integer(1)),
    leadingEdge = lapply(leadingEdge, function(x) id2gene[x]) %>% 
      vapply(paste, character(1), collapse = "; "),
    "{target}" := str_extract(pathway, "(Up|Down|Unchanged)$"),
    pathway = str_remove_all(pathway, " / [^/]+$")
  ) %>% 
  dplyr::select(
    pathway, !!sym(target), Windows = size, p = pval, FDR = padj, 
    `Edge Size`, `Leading Edge` = leadingEdge
  ) %>% 
  reactable(
    filterable = TRUE,
    columns = list2(
      pathway = colDef(
        minWidth = 100, name = "Region"
      ),
      "{target}" := colDef(
        maxWidth = 90,
        cell = function(value) {
          html_symbol <- ""
          if (str_detect(value, "Up")) html_symbol <- "\u21E7"
          if (str_detect(value, "Down")) html_symbol <- "\u21E9"
          paste(html_symbol, value)
        },
        style = function(value) {
           colour <- case_when(
            str_detect(value, "Up") ~ colours$direction[["up"]],
            str_detect(value, "Down") ~ colours$direction[["down"]],
            TRUE ~ colours$direction[["unchanged"]]
          )
          list(color = colour)
        }
      ),
      Windows = colDef(maxWidth = 80, format = colFormat(separators = TRUE)),
      p = colDef(
        cell = function(value) sprintf("%.2e", value), maxWidth = 80
      ),
      FDR = colDef(
        cell = function(value) sprintf("%.2e", value), maxWidth = 80
      ),
      "Edge Size" = colDef(maxWidth = 80),
     "Leading Edge" = colDef(
       minWidth = 150,
       cell = function(value) with_tooltip(value, width = 50)
     ) 
    )
  )
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

Merged windows were mapped to genes and their position amongst the RNA-Seq results was assessed. Windows were classified based on which region was the best overlap, and the direction of AR binding with E2DHT treatment. 12 sets of windows were associated with changes in gene expression, using only the p-value to rank the 13,043 genes considered as detected.

wrap_plots(p)

Barcode plots for the top 9 sets of windows associated with non-directional changes in gene expression.

Windows By Feature

gsea_feat_nondir <- fgsea(feat_dir_gs, rna_sig_ranks, scoreType = "pos")
gsea_feat_nondir_sig <- gsea_feat_nondir %>% 
  arrange(pval) %>% 
  dplyr::filter(padj < enrich_alpha, size >= min_sig) %>% 
  as_tibble() 
p <- gsea_feat_nondir_sig %>% 
  dplyr::slice(1:9) %>% 
  pull("pathway") %>% 
  lapply(
    function(x) {
      plotEnrichment(feat_dir_gs[[x]], rna_sig_ranks) +
        ggtitle(x) +
        theme(
          plot.title = element_text(hjust = 0.5, size = 10)
        )
    }
  )

cp <-  htmltools::em(
  glue(
    "Merged windows were mapped to genes and their position amongst the ",
    "RNA-Seq results was assessed. Windows were classified based on which ",
    "feature was the best overlap, and the direction of {target} binding with ",
    "{treat_levels[[2]]} treatment. {nrow(gsea_feat_nondir_sig)} sets of windows were ",
    "associated with overall changes in gene expression, using only the ", 
    "test statistic to rank the ", 
    "{comma(nrow(rnaseq), 1)} genes considered as detected."
  )
)
tbl <- gsea_feat_nondir_sig %>%
  mutate(
    `Edge Size` = vapply(leadingEdge, length, integer(1)),
    leadingEdge = lapply(leadingEdge, function(x) id2gene[x]) %>% 
      vapply(paste, character(1), collapse = "; "),
    Direction = ifelse(NES > 0, "\u21E7 Up-regulated", "\u21E9 Down-regulated"),
    "{target}" := str_extract(pathway, "(Up|Down|Unchanged)$"),
    pathway = str_remove_all(pathway, " / [^/]+$")
  ) %>% 
  dplyr::select(
    pathway, !!sym(target), Windows = size, Direction, 
    p = pval, FDR = padj, `Edge Size`, `Leading Edge` = leadingEdge
  ) %>% 
  reactable(
    filterable = TRUE,
    columns = list2(
      pathway = colDef(
        minWidth = 100, name = "Region"
      ),
      "{target}" := colDef(
        maxWidth = 90,
        cell = function(value) {
          html_symbol <- ""
          if (str_detect(value, "Up")) html_symbol <- "\u21E7"
          if (str_detect(value, "Down")) html_symbol <- "\u21E9"
          paste(html_symbol, value)
        },
        style = function(value) {
          colour <- case_when(
            str_detect(value, "Up") ~ colours$direction[["up"]],
            str_detect(value, "Down") ~ colours$direction[["down"]],
            TRUE ~ colours$direction[["unchanged"]]
          )
          list(color = colour)
        }
      ),
      Windows = colDef(maxWidth = 80, format = colFormat(separators = TRUE)),
      Direction = colDef(
        name = "Gene Direction",
        maxWidth = 120,
        style = function(value) {
          colour <- ifelse(
            str_detect(value, "Up"), 
            colours$direction[["up"]], 
            colours$direction[["down"]]
          )
          list(color = colour)
        },
      ),
      p = colDef(
        cell = function(value) sprintf("%.2e", value), maxWidth = 80
      ),
      FDR = colDef(
        cell = function(value) sprintf("%.2e", value), maxWidth = 80
      ),
      "Edge Size" = colDef(maxWidth = 80),
     "Leading Edge" = colDef(
       minWidth = 150,
       cell = function(value) with_tooltip(value, width = 50)
     ) 
    )
  )
div(class = "table",
  div(class = "table-header",
      div(class = "caption", cp),
      tbl
  )
)

Merged windows were mapped to genes and their position amongst the RNA-Seq results was assessed. Windows were classified based on which feature was the best overlap, and the direction of AR binding with E2DHT treatment. 4 sets of windows were associated with overall changes in gene expression, using only the test statistic to rank the 13,043 genes considered as detected.

wrap_plots(p)

Barcode plots for the top 4 sets of windows associated with nondirectional changes in gene expression.

Integration of RNA and ChIP Enrichment Analyses

rnaseq_gsea <- fgsea(
  pathways = gs_by_gsid,
  stats = rna_dir_ranks, 
  minSize = min_gs_size, 
  maxSize = max_gs_size
) %>% 
  dplyr::filter(!is.na(pval))
status_cols <- setNames(
  hcl.colors(4, "Viridis", rev = TRUE),
  c("Both", "ChIP Only", "RNA Only", "Neither")
)

The enrichment testing results from the AR differential binding analysis were then compared to GSEA results obtained from the RNA-Seq data set. Given the orthogonal nature of the two dataset, p-values from each analysis were combined using Wilkinson’s maximum p-value method and the resulting p-values adjusted as previously. Distances between nodes (gene-sets) for all network plots were determined by using ChIP target genes within the leading edge only.

Differentially Bound Windows

cmn_diff <- rnaseq_gsea %>%
  dplyr::select(-log2err, -ES, -size) %>% 
  left_join(
    goseq_diff, by = c("pathway" = "gs_name"), suffix = c(".rna", ".chip")
  ) %>%
  dplyr::filter(!is.na(pval.rna), !is.na(pval.chip)) %>% 
  nest(p = starts_with("pval")) %>% 
  mutate(
    pval = vapply(p, function(x) metap::maximump(unlist(x))$p, numeric(1))
  ) %>%
  unnest(p) %>% 
  dplyr::rename(padj_chip = adj_p, padj_rna = padj) %>% 
  mutate(adj_p = p.adjust(pval, extra_params$enrichment$adj)) %>% 
  arrange(pval) %>% 
  dplyr::filter(adj_p < enrich_alpha) %>% 
  mutate(
    leadingEdge = lapply(
      leadingEdge, intersect, dplyr::filter(mapped_ids, up | down)$gene_id
    ),
    numDEInCat = vapply(leadingEdge, length, integer(1)),
    Status = case_when(
      padj_rna < enrich_alpha & padj_chip < enrich_alpha ~ "Both",
      padj_rna < enrich_alpha & !padj_chip < enrich_alpha ~ "RNA Only",
      !padj_rna < enrich_alpha & padj_chip < enrich_alpha ~ "ChIP Only",
      !padj_rna < enrich_alpha & !padj_chip < enrich_alpha ~ "Neither"
    ) %>% 
      factor(levels = names(status_cols))
  ) %>% 
  dplyr::filter(numDEInCat >= min_sig) %>% 
  dplyr::select(
    gs_name = pathway, adj_p, Status, NES, starts_with("padj"),
    starts_with("num"), leadingEdge
  ) 
tg_diff <- cmn_diff %>% 
  make_tbl_graph(setNames(.$leadingEdge, .$gs_name))
plot_net <- length(tg_diff) >= min_network_size
txt <- ifelse(
  nrow(cmn_diff) == 0,
  glue("No gene sets were jointly enriched between all {target} differentially bound windows and the RNA-Seq data"),
  "#### Results Table"
)

No gene sets were jointly enriched between all AR differentially bound windows and the RNA-Seq data

tg_diff %>% 
  activate(nodes) %>% 
  mutate(direction = ifelse(NES > 0, "Up", "Down"))%>% 
  ggraph(layout = net_layout, weights = oc^2) +
  geom_edge_link(aes(width = oc^2, alpha = oc^2)) +
  geom_node_point(
    aes(fill = Status, size = numDEInCat),
    shape = 21
  ) +
  geom_node_text(
    aes(label = label, colour = direction),
    size = 3, 
    data = . %>%
      mutate(
        label = str_replace_all(label, "_", " ") %>% str_trunc(80) %>% str_wrap(width = 18)
      ),
    repel = TRUE, max.overlaps = max(10, round(length(tg_diff) / 4, 0)),
    bg.color = "white", bg.r = 0.1, 
  ) +
  scale_x_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_y_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_fill_manual(values = status_cols) +
  scale_size_continuous(range = c(1, 10)) +
  scale_edge_width(range = c(1, 6), limits = c(0, 1)) +
  scale_edge_alpha(range = c(0.1, 0.4), limits = c(0, 1))  +
  scale_colour_manual(
    values = setNames(direction_colours, str_to_title(names(direction_colours)))
  ) +
  guides(edge_alpha= "none") +
  labs(
    size = glue("{target} Targets In Leading Edge") %>% str_wrap(15),
    colour = "GSEA\nDirection",
    fill = "Prior\nSignificance",
    edge_width = expr(paste(OC^2))
  ) +
  theme_void() +
  theme(legend.text = element_text(hjust = 0.5))

Windows With Increased AR

cmn_up <- rnaseq_gsea %>%
  dplyr::select(-log2err, -ES, -size) %>% 
  left_join(
    goseq_up, by = c("pathway" = "gs_name"), suffix = c(".rna", ".chip")
  ) %>%
  dplyr::filter(!is.na(pval.rna), !is.na(pval.chip)) %>% 
  nest(p = starts_with("pval")) %>% 
  mutate(
    pval = vapply(p, function(x) metap::maximump(unlist(x))$p, numeric(1))
    ) %>%
  unnest(p) %>% 
  dplyr::rename(padj_chip = adj_p, padj_rna = padj) %>% 
  mutate(adj_p = p.adjust(pval, extra_params$enrichment$adj)) %>% 
  arrange(pval) %>% 
  dplyr::filter(adj_p < enrich_alpha) %>% 
  mutate(
    leadingEdge = lapply(
      leadingEdge, intersect, dplyr::filter(mapped_ids, up)$gene_id
    ),
    numDEInCat = vapply(leadingEdge, length, integer(1)),
    Status = case_when(
      padj_rna < enrich_alpha & padj_chip < enrich_alpha ~ "Both",
      padj_rna < enrich_alpha & !padj_chip < enrich_alpha ~ "RNA Only",
      !padj_rna < enrich_alpha & padj_chip < enrich_alpha ~ "ChIP Only",
      !padj_rna < enrich_alpha & !padj_chip < enrich_alpha ~ "Neither"
    ) %>% 
      factor(levels = names(status_cols))
  ) %>% 
  dplyr::filter(numDEInCat >= min_sig) %>% 
  dplyr::select(
    gs_name = pathway, adj_p, Status, NES, starts_with("padj"),
    starts_with("num"), leadingEdge
  ) 
tg_up <- cmn_up %>% 
  make_tbl_graph(
    setNames(.$leadingEdge, .$gs_name)
  )
plot_net <- length(tg_up) >= min_network_size
txt <- ifelse(
  nrow(cmn_up) == 0,
  glue("No gene sets were jointly enriched between all increasing {target} bound windows and the RNA-Seq data"),
  "#### Results Table"
)

No gene sets were jointly enriched between all increasing AR bound windows and the RNA-Seq data

tg_up %>% 
  activate(nodes) %>% 
  mutate(direction = ifelse(NES > 0, "Up", "Down")) %>% 
  ggraph(layout = net_layout, weights = oc^2) +
  geom_edge_link(aes(width = oc^2, alpha = oc^2)) +
  geom_node_point(
    aes(fill = Status, size = numDEInCat),
    shape = 21
  ) +
  geom_node_text(
    aes(label = label, colour = direction),
    size = 3, 
    data = . %>%
      mutate(
        label = str_replace_all(label, "_", " ") %>% str_trunc(80) %>% str_wrap(width = 18)
      ),
    repel = TRUE, max.overlaps = max(10, round(length(tg_up) / 4, 0)),
    bg.color = "white", bg.r = 0.1, 
  ) +
  scale_x_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_y_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_fill_manual(values = status_cols) +
  scale_size_continuous(range = c(1, 10)) +
  scale_edge_width(range = c(1, 6), limits = c(0, 1)) +
  scale_edge_alpha(range = c(0.1, 0.4), limits = c(0, 1))  +
  scale_colour_manual(
    values = setNames(direction_colours, str_to_title(names(direction_colours)))
  ) +
  guides(edge_alpha= "none") +
  labs(
    size = glue("{target} Targets In Leading Edge") %>% str_wrap(15),
    colour = "GSEA\nDirection",
    fill = "Prior\nSignificance",
    edge_width = expr(paste(OC^2))
  ) +
  theme_void() +
  theme(legend.text = element_text(hjust = 0.5))

Windows With Decreased AR

cmn_down <- rnaseq_gsea %>%
  dplyr::select(-log2err, -ES, -size) %>% 
  left_join(
    goseq_down, by = c("pathway" = "gs_name"), suffix = c(".rna", ".chip")
  ) %>%
  dplyr::filter(!is.na(pval.rna), !is.na(pval.chip)) %>% 
  nest(p = starts_with("pval")) %>% 
  mutate(
    pval = vapply(p, function(x) metap::maximump(unlist(x))$p, numeric(1))
    ) %>%
  dplyr::rename(padj_chip = adj_p, padj_rna = padj) %>% 
  mutate(adj_p = p.adjust(pval, extra_params$enrichment$adj)) %>% 
  arrange(pval) %>% 
  dplyr::filter(adj_p < enrich_alpha) %>% 
  mutate(
    leadingEdge = lapply(
      leadingEdge, intersect, dplyr::filter(mapped_ids, down)$gene_id
    ),
    numDEInCat = vapply(leadingEdge, length, integer(1)),
    Status = case_when(
      padj_rna < enrich_alpha & padj_chip < enrich_alpha ~ "Both",
      padj_rna < enrich_alpha & !padj_chip < enrich_alpha ~ "RNA Only",
      !padj_rna < enrich_alpha & padj_chip < enrich_alpha ~ "ChIP Only",
      !padj_rna < enrich_alpha & !padj_chip < enrich_alpha ~ "Neither"
    ) %>% 
      factor(levels = names(status_cols))
  ) %>% 
  dplyr::filter(numDEInCat >= min_sig) %>% 
  dplyr::select(
    gs_name = pathway, adj_p, Status, NES, starts_with("padj"),
    starts_with("num"), leadingEdge
  ) 
tg_down <- cmn_down %>% 
  make_tbl_graph(
    setNames(.$leadingEdge, .$gs_name)
  )
plot_net <- length(tg_down) >= min_network_size
txt <- ifelse(
  nrow(cmn_down) == 0,
  glue("No gene sets were jointly enriched between all decreasing {target} bound windows and the RNA-Seq data"),
  "#### Results Table"
)

No gene sets were jointly enriched between all decreasing AR bound windows and the RNA-Seq data

tg_down %>% 
  activate(nodes) %>% 
  mutate(direction = ifelse(NES > 0, "Up", "Down")) %>% 
  ggraph(layout = net_layout, weights = oc^2) +
  geom_edge_link(aes(width = oc^2, alpha = oc^2)) +
  geom_node_point(
    aes(fill = Status, size = numDEInCat),
    shape = 21
  ) +
  geom_node_text(
    aes(label = label, colour = direction),
    size = 3, 
    data = . %>%
      mutate(
        label = str_replace_all(label, "_", " ") %>% str_trunc(80) %>% str_wrap(width = 18)
      ),
    repel = TRUE, max.overlaps = max(10, round(length(tg_down) / 4, 0)),
    bg.color = "white", bg.r = 0.1, 
  ) +
  scale_x_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_y_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_fill_manual(values = status_cols) +
  scale_size_continuous(range = c(1, 10)) +
  scale_edge_width(range = c(1, 6), limits = c(0, 1)) +
  scale_edge_alpha(range = c(0.1, 0.4), limits = c(0, 1))  +
  scale_colour_manual(
    values = setNames(direction_colours, str_to_title(names(direction_colours)))
  ) +
  guides(edge_alpha= "none") +
  labs(
    size = glue("{target} Targets In Leading Edge") %>% str_wrap(15),
    colour = "GSEA\nDirection",
    fill = "Prior\nSignificance",
    edge_width = expr(paste(OC^2))
  ) +
  theme_void() +
  theme(legend.text = element_text(hjust = 0.5))

Highly Ranked Genes

knitr::opts_chunk$set(dev = fig_type)
all_ids <- unlist(merged_results$gene_id)
db_ids <- unlist(filter(merged_results, !!sym(fdr_column) < fdr_alpha)$gene_id)
top_diff_win <- rnaseq %>% 
  mutate(
    n_windows = vapply(
      gene_id,
      function(x) sum(str_detect(all_ids, x)),
      integer(1)
    ),
    n_db_windows = vapply(
      gene_id,
      function(x) sum(str_detect(db_ids, x)),
      integer(1)
    )
  ) %>% 
  arrange(!!sym(rna_p_col)) %>% 
  dplyr::filter(n_windows > 1) %>% 
  dplyr::slice(1:5)
grl_to_plot <- top_diff_win$gene_id %>% 
  sapply(
    function(x) {
      subset(
        merged_results, 
        vapply(gene_id, function(id) x %in% id, logical(1))
      ) %>% 
        granges() %>% 
        ## Restrict this to only sites within 5Mb of a gene
        subsetByOverlaps(
          resize(subset(gtf_gene, gene_id == x), width = 1e7, fix = 'center')
        ) %>% 
        c(
          granges(subset(gtf_gene, gene_id == x))
        ) %>% 
        unstrand() %>% 
        sort()
    },
    simplify = FALSE
  ) %>% 
  lapply(range) %>% 
  GRangesList()
hfgc_genes <- hfgc_genes[names(hfgc_genes) != "Undetected"]
if (!is.null(config$external$coverage)) {
  y_ranges <- granges(unlist(grl_to_plot))
  y_lim <- c(
    y_lim[target],
    bwfl[names(bwfl) != target] %>% 
      lapply(
        function(x) {
          GRangesList(lapply(x, import.bw, which = y_ranges)) %>% 
            unlist() %>% 
            filter(score == max(score)) %>% 
            mcols() %>% 
            .[["score"]] %>% 
            c(0) %>% 
            range()
        }
      )
  )
}
i <- 1

Using the ranked list of genes from the RNA-Seq data, the 5 genes most highly ranked for differential expression, with more than one AR-bound window mapped to it were selected for visualisation.

AQP3

ct <- trans_models %>% 
  subsetByOverlaps(grl_to_plot[[i]]) %>% 
  as_tibble() %>% 
  pull("transcript") %>% 
  unique() %>% 
  length() %>% 
  is_greater_than(10) %>% 
  ifelse("meta", FALSE)
plotHFGC(
  grl_to_plot[[i]],
  hic = hic %>% 
     subsetByOverlaps(
       subset(gtf_gene, gene_id == names(grl_to_plot)[[i]])
     ) %>% 
    subsetByOverlaps(
      subsetByOverlaps(merged_results, grl_to_plot[[i]])
    ),
  features = feat_gr, featcol = feature_colours, featstack = "dense", 
  genes = hfgc_genes, genecol = gene_col, genesize = 0.7,
  coverage = bwfl,
  linecol = line_col,
  annotation = annot,
  annotcol = colours$direction %>%
    setNames(str_to_title(names(.))),
  cytobands = bands_df,
  collapseTranscripts = ct, # Set this as a function of n_transcripts
  zoom = 1.1,
  fontsize = 10,
  max = width(grl_to_plot[[i]]),
  ylim = y_lim,
  highlight = c(),
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

Region showing all merged AR-bound windows mapped to UGDH. Windows considered to be unchanged for AR are annotated in grey, with other colours indicating AR gain or loss. Mapped windows are only shown if within 5Mb of UGDH. Gene-centric regions and external features are shown in the top panel. The estimated logFC for UGDH is 1.52 with an FDR of 2.34e-10. Undetected and unchanged genes are shown on separate tracks.

i <- i + 1

UGDH

ct <- trans_models %>% 
  subsetByOverlaps(grl_to_plot[[i]]) %>% 
  as_tibble() %>% 
  pull("transcript") %>% 
  unique() %>% 
  length() %>% 
  is_greater_than(10) %>% 
  ifelse("meta", FALSE)
plotHFGC(
  grl_to_plot[[i]],
  hic = hic %>% 
     subsetByOverlaps(
       subset(gtf_gene, gene_id == names(grl_to_plot)[[i]])
     ) %>% 
    subsetByOverlaps(
      subsetByOverlaps(merged_results, grl_to_plot[[i]])
    ),
  features = feat_gr, featcol = feature_colours, featstack = "dense", 
  genes = hfgc_genes, genecol = gene_col, genesize = 0.7,
  coverage = bwfl,
  linecol = line_col,
  annotation = annot,
  annotcol = colours$direction %>%
    setNames(str_to_title(names(.))),
  cytobands = bands_df,
  collapseTranscripts = ct, # Set this as a function of n_transcripts
  zoom = 1.1,
  max = width(grl_to_plot[[i]]),
  fontsize = 10,
  ylim = y_lim,
  highlight = c(),
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

Region showing all merged AR-bound windows mapped to FKBP5. Windows considered to be unchanged for AR are annotated in grey, with other colours indicating AR gain or loss. Mapped windows are only shown if within 5Mb of FKBP5. Gene-centric regions and external features are shown in the top panel. The estimated logFC for FKBP5 is 2.00 with an FDR of 6.51e-10. Undetected and unchanged genes are shown on separate tracks.

i <- i + 1

FKBP5

ct <- trans_models %>% 
  subsetByOverlaps(grl_to_plot[[i]]) %>% 
  as_tibble() %>% 
  pull("transcript") %>% 
  unique() %>% 
  length() %>% 
  is_greater_than(10) %>% 
  ifelse("meta", FALSE)
plotHFGC(
  grl_to_plot[[i]],
  hic = hic %>% 
     subsetByOverlaps(
       subset(gtf_gene, gene_id == names(grl_to_plot)[[i]])
     ) %>% 
    subsetByOverlaps(
      subsetByOverlaps(merged_results, grl_to_plot[[i]])
    ),
  features = feat_gr, featcol = feature_colours, featstack = "dense", 
  genes = hfgc_genes, genecol = gene_col, genesize = 0.7,
  coverage = bwfl,
  linecol = line_col,
  annotation = annot,
  annotcol = colours$direction %>%
    setNames(str_to_title(names(.))),
  cytobands = bands_df,
  collapseTranscripts = ct, # Set this as a function of n_transcripts
  zoom = 1.1,
  max = width(grl_to_plot[[i]]),
  fontsize = 10,
  ylim = y_lim,
  highlight = c(),
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

Region showing all merged AR-bound windows mapped to SEC14L2. Windows considered to be unchanged for AR are annotated in grey, with other colours indicating AR gain or loss. Mapped windows are only shown if within 5Mb of SEC14L2. Gene-centric regions and external features are shown in the top panel. The estimated logFC for SEC14L2 is 2.65 with an FDR of 8.87e-10. Undetected and unchanged genes are shown on separate tracks.

i <- i + 1

SEC14L2

ct <- trans_models %>% 
  subsetByOverlaps(grl_to_plot[[i]]) %>% 
  as_tibble() %>% 
  pull("transcript") %>% 
  unique() %>% 
  length() %>% 
  is_greater_than(10) %>% 
  ifelse("meta", FALSE)
plotHFGC(
  grl_to_plot[[i]],
  hic = hic %>% 
     subsetByOverlaps(
       subset(gtf_gene, gene_id == names(grl_to_plot)[[i]])
     ) %>% 
    subsetByOverlaps(
      subsetByOverlaps(merged_results, grl_to_plot[[i]])
    ),
  features = feat_gr, featcol = feature_colours, featstack = "dense", 
  genes = hfgc_genes, genecol = gene_col, genesize = 0.7,
  coverage = bwfl,
  linecol = line_col,
  annotation = annot,
  annotcol = colours$direction %>%
    setNames(str_to_title(names(.))),
  cytobands = bands_df,
  collapseTranscripts = ct, # Set this as a function of n_transcripts
  zoom = 1.1,
  max = width(grl_to_plot[[i]]),
  fontsize = 10,
  ylim = y_lim,
  highlight = c(),
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

Region showing all merged AR-bound windows mapped to CLDN8. Windows considered to be unchanged for AR are annotated in grey, with other colours indicating AR gain or loss. Mapped windows are only shown if within 5Mb of CLDN8. Gene-centric regions and external features are shown in the top panel. The estimated logFC for CLDN8 is 2.31 with an FDR of 1.01e-09. Undetected and unchanged genes are shown on separate tracks.

i <- i + 1

CLDN8

ct <- trans_models %>% 
  subsetByOverlaps(grl_to_plot[[i]]) %>% 
  as_tibble() %>% 
  pull("transcript") %>% 
  unique() %>% 
  length() %>% 
  is_greater_than(10) %>% 
  ifelse("meta", FALSE)
plotHFGC(
  grl_to_plot[[i]],
  hic = hic %>% 
     subsetByOverlaps(
       subset(gtf_gene, gene_id == names(grl_to_plot)[[i]])
     ) %>% 
    subsetByOverlaps(
      subsetByOverlaps(merged_results, grl_to_plot[[i]])
    ),
  features = feat_gr, featcol = feature_colours, featstack = "dense", 
  genes = hfgc_genes, genecol = gene_col, genesize = 0.7,
  coverage = bwfl,
  linecol = line_col,
  annotation = annot,
  annotcol = colours$direction %>%
    setNames(str_to_title(names(.))),
  cytobands = bands_df,
  collapseTranscripts = ct, # Set this as a function of n_transcripts
  zoom = 1.1,
  max = width(grl_to_plot[[i]]),
  fontsize = 10,
  ylim = y_lim,
  highlight = c(),
  col.title = "black", background.title = "white", showAxis = FALSE,
  rotation.title = 90
)

Data Export

merged_results %>% 
  as_tibble() %>%
  unnest(all_of("gene_id")) %>%
  mutate(gene_name = id2gene[gene_id]) %>% 
  dplyr::select(
    gene_id, gene_name, range, AveExpr, logFC, P.Value, FDR = !!sym(fdr_column), 
    status, any_of(c("region", "feature")), macs2_peak = overlaps_ref,
  ) %>% 
  mutate(
    distance_to_gene = distance(
      GRanges(range),
      setNames(gtf_gene, gtf_gene$gene_id)[gene_id]
    )
  ) %>% 
  write_csv(
    gzfile(all_out$csv)
  )

write_rds(merged_results, all_out$results, compress = "gz")
write_rds(filtered_counts, all_out$windows, compress = "gz")
file.create(all_out$de_genes)
if (has_rnaseq) write_csv(de_genes_db_regions, all_out$de_genes)

file.create(all_out$up_regions)
if (sum(merged_results$status == "Up") > 0) {
  merged_results %>% 
    filter(status == "Up") %>% 
    granges() %>% 
    export.bed(all_out$up_regions)
}

file.create(all_out$down_regions)
if (sum(merged_results$status == "Down") > 0) {
  merged_results %>% 
    filter(status == "Down") %>% 
    granges() %>% 
    export.bed(all_out$up_regions)
}
## Enrichment results need a bit of tweaking to match the tables
list2(
  `All Differentially Bound` = goseq_diff,
  "All Increased {target}" := goseq_up,
  "All Decreased {target}" := goseq_down
) %>% 
  bind_rows(.id = 'group') %>% 
  dplyr::filter(adj_p < enrich_alpha) %>% 
  left_join(
    dplyr::select(msigdb, gs_name, gene_name, gene_id, gs_url, gs_description),
    by = "gs_name"
  ) %>% 
  dplyr::filter(
    gene_id %in% unlist(subset(merged_results, status != "Unchanged")$gene_id)
  ) %>% 
  dplyr::select(-gene_id) %>% 
  chop(gene_name) %>% 
  mutate(
    gene_name = vapply(gene_name, paste, character(1), collapse = ": "),
    `%` = numDEInCat / numInCat,
  ) %>% 
  dplyr::select(
    group,
    `Gene Set` = gs_name, Description = gs_description, URL = gs_url,
    `%`, numDEInCat, numInCat, p = pval, adj_p, 
    `Genes Mapped to DB Windows` = gene_name
  ) %>% 
  write_csv(all_out$enrichment)
list2(
  `All Differentially Bound` = cmn_diff,
  "All Increased {target}" := cmn_up,
  "All Decreased {target}" := cmn_down
) %>% 
  bind_rows(.id = "group") %>% 
  mutate(
    genes = vapply(leadingEdge, paste, character(1), collapse = "; "),
    leadingEdge = vapply(leadingEdge, paste, character(1), collapse = "; "),
  ) %>% 
  write_csv(all_out$rna_enrichment)
## Save key memory on the HDD
rm(window_counts)
gc()
save.image(all_out$renv)

During this workflow, the following files were exported:

results: output/differential_binding/AR/AR_E2_E2DHT-differential_binding.rds
csv: output/differential_binding/AR/AR_E2_E2DHT-differential_binding.csv.gz
windows: output/differential_binding/AR/AR_E2_E2DHT-filtered_windows.rds
up_regions: output/differential_binding/AR/AR_E2_E2DHT-up.bed
down_regions: output/differential_binding/AR/AR_E2_E2DHT-down.bed
de_genes: output/differential_binding/AR/AR_E2_E2DHT-DE_genes.csv
enrichment: output/differential_binding/AR/AR_E2_E2DHT-enrichment.csv
rna_enrichment: output/differential_binding/AR/AR_E2_E2DHT-rnaseq_enrichment.csv
renv: output/envs/AR_E2_E2DHT-differential_binding.RData

References

Hicks, S. C., and R. A. Irizarry. 2015. “quantro: a data-driven approach to guide the choice of an appropriate normalization method.” Genome Biol 16 (June): 117.

Hicks, Stephanie C, Kwame Okrah, Joseph N Paulson, John Quackenbush, Rafael A Irizarry, and Héctor Corrada Bravo. 2017. “Smooth quantile normalization.” Biostatistics 19 (2): 185–98. https://doi.org/10.1093/biostatistics/kxx028.

Ignatiadis, N., B. Klaus, J. B. Zaugg, and W. Huber. 2016. “Data-driven hypothesis weighting increases detection power in genome-scale multiple testing.” Nat Methods 13 (7): 577–80.

Korotkevich, Gennady, Vladimir Sukhov, and Alexey Sergushichev. 2019. “Fast Gene Set Enrichment Analysis.” bioRxiv. https://doi.org/10.1101/060012.

Law, C. W., Y. Chen, W. Shi, and G. K. Smyth. 2014. “voom: Precision weights unlock linear model analysis tools for RNA-seq read counts.” Genome Biol 15 (2): R29.

Lun, A. T., and G. K. Smyth. 2015. “From reads to regions: a Bioconductor workflow to detect differential binding in ChIP-seq data.” F1000Res 4: 1080.

Lun, Aaron T L, and Gordon K Smyth. 2014. “De Novo Detection of Differentially Bound Regions for ChIP-Seq Data Using Peaks and Windows: Controlling Error Rates Correctly.” Nucleic Acids Res. 42 (11): e95.

McCarthy, Davis J., and Gordon K. Smyth. 2009. “Testing significance relative to a fold-change threshold is a TREAT.” Bioinformatics 25 (6): 765–71. https://doi.org/10.1093/bioinformatics/btp053.

Subramanian, Aravind, Pablo Tamayo, Vamsi K. Mootha, Sayan Mukherjee, Benjamin L. Ebert, Michael A. Gillette, Amanda Paulovich, et al. 2005. “Gene Set Enrichment Analysis: A Knowledge-Based Approach for Interpreting Genome-Wide Expression Profiles.” Proceedings of the National Academy of Sciences 102 (43): 15545–50. https://doi.org/10.1073/pnas.0506580102.

Young, M. D., M. J. Wakefield, G. K. Smyth, and A. Oshlack. 2010. “Gene ontology analysis for RNA-seq: accounting for selection bias.” Genome Biol 11 (2): R14.

Zhang, Y., T. Liu, C. A. Meyer, J. Eeckhoute, D. S. Johnson, B. E. Bernstein, C. Nusbaum, et al. 2008. “Model-based analysis of ChIP-Seq (MACS).” Genome Biol 9 (9): R137.

R version 4.2.3 (2023-03-15)

Platform: x86_64-conda-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats4, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: fgsea(v.1.24.0), qsmooth(v.1.14.0), quantro(v.1.32.0), metap(v.1.1), ggraph(v.2.1.0), tidygraph(v.1.2.3), extraChIPs(v.1.5.7), goseq(v.1.50.0), geneLenDataBase(v.1.34.0), BiasedUrn(v.2.0.10), msigdbr(v.7.5.1), htmltools(v.0.5.5), reactable(v.0.4.4), GenomicInteractions(v.1.32.0), InteractionSet(v.1.26.0), ggside(v.0.2.2), rlang(v.1.1.1), ggrepel(v.0.9.3), IHW(v.1.26.0), statmod(v.1.5.0), magrittr(v.2.0.3), scales(v.1.2.1), edgeR(v.3.40.0), limma(v.3.54.0), csaw(v.1.32.0), SummarizedExperiment(v.1.28.0), Biobase(v.2.58.0), MatrixGenerics(v.1.10.0), matrixStats(v.1.0.0), ngsReports(v.2.0.0), patchwork(v.1.1.2), Rsamtools(v.2.14.0), Biostrings(v.2.66.0), XVector(v.0.38.0), rtracklayer(v.1.58.0), plyranges(v.1.18.0), GenomicRanges(v.1.50.0), GenomeInfoDb(v.1.34.9), IRanges(v.2.32.0), S4Vectors(v.0.36.0), BiocGenerics(v.0.44.0), yaml(v.2.3.7), BiocParallel(v.1.32.5), pander(v.0.6.5), glue(v.1.6.2), lubridate(v.1.9.2), forcats(v.1.0.0), stringr(v.1.5.0), dplyr(v.1.1.2), purrr(v.1.0.1), readr(v.2.1.4), tidyr(v.1.3.0), tibble(v.3.2.1), ggplot2(v.3.4.2) and tidyverse(v.2.0.0)

loaded via a namespace (and not attached): rappdirs(v.0.3.3), bumphunter(v.1.40.0), minfi(v.1.44.0), bit64(v.4.0.5), knitr(v.1.43), DelayedArray(v.0.24.0), data.table(v.1.14.8), rpart(v.4.1.19), GEOquery(v.2.66.0), KEGGREST(v.1.38.0), RCurl(v.1.98-1.12), AnnotationFilter(v.1.22.0), doParallel(v.1.0.17), generics(v.0.1.3), preprocessCore(v.1.60.2), GenomicFeatures(v.1.50.2), cowplot(v.1.1.1), EnrichedHeatmap(v.1.27.2), lambda.r(v.1.2.4), RSQLite(v.2.3.1), bit(v.4.0.5), tzdb(v.0.4.0), xml2(v.1.3.4), viridis(v.0.6.3), xfun(v.0.39), hms(v.1.1.3), jquerylib(v.0.1.4), babelgene(v.22.9), evaluate(v.0.21), scrime(v.1.3.5), fansi(v.1.0.4), restfulr(v.0.0.15), progress(v.1.2.2), dbplyr(v.2.3.2), igraph(v.1.4.3), DBI(v.1.1.3), htmlwidgets(v.1.6.2), futile.logger(v.1.4.3), reshape(v.0.8.9), ellipsis(v.0.3.2), crosstalk(v.1.2.0), backports(v.1.4.1), annotate(v.1.76.0), biomaRt(v.2.54.0), deldir(v.1.0-9), sparseMatrixStats(v.1.10.0), vctrs(v.0.6.3), here(v.1.0.1), ensembldb(v.2.22.0), cachem(v.1.0.8), withr(v.2.5.0), ggforce(v.0.4.1), Gviz(v.1.42.0), BSgenome(v.1.66.3), checkmate(v.2.2.0), vroom(v.1.6.3), GenomicAlignments(v.1.34.0), fdrtool(v.1.2.17), prettyunits(v.1.1.1), mclust(v.6.0.0), cluster(v.2.1.4), lazyeval(v.0.2.2), crayon(v.1.5.2), genefilter(v.1.80.0), labeling(v.0.4.2), pkgconfig(v.2.0.3), slam(v.0.1-50), tweenr(v.2.0.2), nlme(v.3.1-162), ProtGenerics(v.1.30.0), nnet(v.7.3-19), lifecycle(v.1.0.3), filelock(v.1.0.2), BiocFileCache(v.2.6.0), dichromat(v.2.0-0.1), VennDiagram(v.1.7.3), rprojroot(v.2.0.3), polyclip(v.1.10-4), reactR(v.0.4.4), rngtools(v.1.5.2), base64(v.2.0.1), Matrix(v.1.5-4.1), lpsymphony(v.1.26.0), Rhdf5lib(v.1.20.0), zoo(v.1.8-12), base64enc(v.0.1-3), GlobalOptions(v.0.1.2), png(v.0.1-8), viridisLite(v.0.4.2), rjson(v.0.2.21), bitops(v.1.0-7), rhdf5filters(v.1.10.0), blob(v.1.2.4), DelayedMatrixStats(v.1.20.0), doRNG(v.1.8.6), shape(v.1.4.6), nor1mix(v.1.3-0), jpeg(v.0.1-10), memoise(v.2.0.1), plyr(v.1.8.8), zlibbioc(v.1.44.0), compiler(v.4.2.3), BiocIO(v.1.8.0), illuminaio(v.0.40.0), RColorBrewer(v.1.1-3), clue(v.0.3-64), cli(v.3.6.1), htmlTable(v.2.4.1), formatR(v.1.14), Formula(v.1.2-5), MASS(v.7.3-60), mgcv(v.1.8-42), tidyselect(v.1.2.0), stringi(v.1.7.12), highr(v.0.10), askpass(v.1.1), locfit(v.1.5-9.8), latticeExtra(v.0.6-30), grid(v.4.2.3), sass(v.0.4.6), VariantAnnotation(v.1.44.0), fastmatch(v.1.1-3), tools(v.4.2.3), timechange(v.0.2.0), parallel(v.4.2.3), circlize(v.0.4.15), rstudioapi(v.0.14), foreach(v.1.5.2), foreign(v.0.8-84), metapod(v.1.6.0), gridExtra(v.2.3), farver(v.2.1.1), ComplexUpset(v.1.3.3), digest(v.0.6.31), quadprog(v.1.5-8), Rcpp(v.1.0.10), siggenes(v.1.72.0), broom(v.1.0.5), httr(v.1.4.6), ggdendro(v.0.1.23), AnnotationDbi(v.1.60.0), biovizBase(v.1.46.0), ComplexHeatmap(v.2.14.0), Rdpack(v.2.4), colorspace(v.2.1-0), XML(v.3.99-0.14), splines(v.4.2.3), graphlayouts(v.1.0.0), multtest(v.2.54.0), plotly(v.4.10.2), xtable(v.1.8-4), jsonlite(v.1.8.7), futile.options(v.1.0.1), R6(v.2.5.1), Hmisc(v.5.1-0), pillar(v.1.9.0), fastmap(v.1.1.1), DT(v.0.28), beanplot(v.1.3.1), codetools(v.0.2-19), utf8(v.1.2.3), sva(v.3.46.0), lattice(v.0.21-8), bslib(v.0.5.0), curl(v.5.0.1), openssl(v.2.0.6), GO.db(v.3.16.0), interp(v.1.1-4), survival(v.3.5-5), rmarkdown(v.2.23), munsell(v.0.5.0), GetoptLong(v.1.0.5), rhdf5(v.2.42.0), GenomeInfoDbData(v.1.2.9), iterators(v.1.0.14), HDF5Array(v.1.26.0), gtable(v.0.3.3) and rbibutils(v.2.2.13)

AR Differential Binding: E2DHT Vs. E2

28 July, 2023

Outline

Differential Binding

Enrichment Analysis

Incorporation with RNA-Seq

Data Preparation

Sliding Window Counts

Normalisation

Data Inspection

QSmooth Weights

Density Plot: All Sliding Windows

logCPM Densities

RLE (Pre-Normalisation)

RLE (Post-Normalisation)

PCA

Differential Binding Analysis

Primary Analysis

Independent Hypothesis Weighting

Summary Table

Initial Group Sizes

P-Value Distributions

IHW Weights

FDR Comparison

Mapping Windows

Results

Result Tables

Overall Results

Summary By Region

Summary By Feature

Most Highly Ranked

Summary Plots

MA Plot

Volcano Plot

Heatmap: Increased Binding

Heatmap: Decreased Binding

Results By Chromosome

Signal By Region

Differential Binding By Region

Signal By Feature

Differential Binding By Feature

Differential Binding By Region And Feature

Inspection of Top-Ranked Regions

Normalised Sliding Windows

Decreased Binding (Strongest Signal)

Decreased Binding (Lowest FDR)

Decreased Binding (Most Extreme)

Increased Binding (Strongest Signal)

Increased Binding (Lowest FDR)

Increased Binding (Most Extreme)

Significant Increased Binding (No Union Peak)

Enrichment Testing

Windows With Detected AR

Results Table

Network Plot

Windows With Differentially Bound AR

Windows With Increased AR Binding

Windows With Decreased AR Binding

Comparison With RNA Seq

Distribution Of Detected Genes

Detected Genes By AR Binding Region

Detected Genes By Feature

Relationship With Differentially Expressed Genes

DE Genes Associated with Changed AR Binding

Volcano Plots By AR Binding

P-Values By AR Status

ChIP logFC Vs RNA logFC

Binding Pattern GSEA

GSEA By Direction of Regulation

All Windows

Windows By Region

Windows By Feature

GSEA By Significance Only

All Windows

Windows By Region

Windows By Feature

Integration of RNA and ChIP Enrichment Analyses

Differentially Bound Windows

Windows With Increased AR

Windows With Decreased AR