This workshop demonstrates how to perform ELMER analysis using matched RNA-seq and DNA methylation data.
You can find a detailed information about ELMER at http://bioconductor.org/packages/3.10/bioc/vignettes/ELMER/inst/doc/index.html.
Articles about ELMER:
SummarizedExperiment
classesStudents will have a chance to run ELMER analysis on a provided MultiAssayExperiment
object created from TCGA data from GDC data portal.
MultiAssayExperiment
objectFirst, we need to load the R packages used in this workshop, they should provide the required functions to run the code.
suppressMessages({
library("ELMER")
library("MultiAssayExperiment")
})
The RNA-seq data, DNA methylation data and patients metadata (i.e age, gender) used in this workshop is
structured as a MultiAssayExperiment
. You can read more about a MultiAssayExperiment
object at http://bioconductor.org/packages/MultiAssayExperiment/
and https://bioconductor.github.io/BiocWorkshops/workflow-for-multi-omics-analysis-with-multiassayexperiment.html.
Through the next section we will:
The data which is available in this machine, can also be downloaded at this google drive.
mae <- readRDS("Data/TCGA_ESCA_MAE_distal_regions.rds")
mae
colnames(mae)
# check experiments
experiments(mae)
# Get the Gene expression object
rna.seq <- mae[[2]]
message("Number of genes ",nrow(rna.seq))
message("Number of samples ",ncol(rna.seq))
message("Check genes metadata")
rowRanges(rna.seq)
message("Check gene expression data")
assay(rna.seq)[1:4,1:4]
# Get the DNA methylation object
dna.met <- mae[[1]]
message("Number of probes: ", nrow(dna.met))
message("Number of samples: ", ncol(dna.met))
message("Check DNA methylation probes metadata")
rowRanges(dna.met)[,1:4]
message("Check DNA methylation beta values")
assay(dna.met)[1:4,1:4]
message("metadata can be accessed using the function colData")
message("we will check the first 4 samples and their 4 columns")
colData(mae)[1:4,1:4]
message("or you can also access the metadata directly with the $")
mae$primary_diagnosis[1:4]
message("summarize the number of tissue type with the function table")
table(mae$definition)
message("summarize the numbers of samples using two features")
table(mae$primary_diagnosis,mae$definition)
Since we will use only Primary solid Tumor samples we will remove the Metastatic and Solid Tissue Normal samples from our object
mae <- mae[, mae$definition == "Primary solid Tumor"]
# which is the column of the samples metadata defining our groups
group.col <- "primary_diagnosis"
# which are the grops withing the group.col that we want to compare
group1 <- "Adenocarcinoma, NOS"
group2 <- "Squamous cell carcinoma, NOS"
# Identify probes hypo in group1 or hypo in group2 ?
direction <- "hypo" # hypo in group 1
cores <- 2
mode <- "supervised"
# Where do we want to save the results ?
dir.out <- paste0("analysis/",group.col,"-",
gsub("[[:punct:]]| ", "_", group1),
"_vs_",
gsub("[[:punct:]]| ", "_", group2),
"_", mode,
"/", direction)
message("Results will be saved at: ",dir.out)
dir.create(dir.out, showWarnings = FALSE,recursive = T)
# time expected with one core: 4 minuntes
diff.probes <- get.diff.meth(data = mae,
group.col = group.col,
group1 = group1,
group2 = group2,
diff.dir = direction, # Get probes hypometh. in group 1
cores = cores,
mode = mode,
pvalue = 0.01,
sig.dif = 0.3,
dir.out = dir.out,
save = TRUE)
# check the results
head(diff.probes)
Now that we have our hypomethylated probes we will look for the 20 nearest genes
to each probes to identify if any of these genes could being affected by the
hypomethylation in the distal regulatory region.
# time expected less than one minute
nearGenes <- GetNearGenes(data = mae,
probes = diff.probes$probe,
numFlankingGenes = 20)
head(nearGenes)
Now that we have our hypomethylated probes and the 20 nearest genes we want to identify if any of these genes could being affected by the hypomethylatio in the distal regulatory region.
# time expected one minute
pair <- get.pair(data = mae,
nearGenes = nearGenes,
group.col = group.col,
group1 = group1,
group2 = group2,
mode = mode,
diff.dir = direction,
raw.pvalue = 0.001,
Pe = 0.001,
filter.probes = TRUE,
filter.percentage = 0.05,
filter.portion = 0.3,
dir.out = dir.out,
cores = cores,
label = direction)
head(pair)
We can plot the correlation heatmap of DNA methylation and gene expression
with the heatmapPairs
function. We will plot only the first 100 pairs.
# Plot size options
options(repr.plot.width=12, repr.plot.height=7)
heatmapPairs(data = mae,
group.col = group.col,
group1 = group1,
group2 = group2,
subset = TRUE,
pairs = pair[1:100,],
filename = NULL)
We have hypomethylated regions that has a distal gene upregulated.
We want to identify which is the MR TF binding to those regions and regulating the distal gene.
First we identify which are the enriched motifs for those hypomethylated regions using
the function get.enriched.motif
. This will output the enriched motifs and the respective
hypomethylated probes with the motif signature around it
(since a DNA methylation probes is 1bp, we extended the search window to $\pm$ 250 bp).
# time expected less than one minute
enriched.motif <- get.enriched.motif(data = mae,
probes = unique(pair$Probe),
dir.out = dir.out,
label = direction,
min.incidence = 10,
lower.OR = 1.5)
message("check top 5 enriched motifs")
names(enriched.motif)[1:5]
message("Check the paired hypomethylated probes with the motif signature")
enriched.motif$GATA4_HUMAN.H11MO.0.A
message("Plot enrichement analysis results")
motif.result.file <- file.path(dir.out, "getMotif.hypo.motif.enrichment.csv")
# Plot size options
options(repr.plot.width=12, repr.plot.height=7)
motif.enrichment.plot(motif.enrichment = motif.result.file,
significant = list(lowerOR = 2.0),
label = "hypo",
summary = TRUE,
save = FALSE)
Now that we have our enriched motifs, we have the following question:
It is important to highlight that since TF within the same family and subfamily have very close motifs, they will likely be indentified to bind the same regions. You can check that by verifying the intersection of probes from our enrichment analysis results.
So more than 70% of probes are enriched for both GATA4 and GATA6. How could we infer which of those MR might be the one with higher impact ?
For that ELMER correlates the TF expression vs the average mean methylation of the hypomethylated probes with a motif signature and rank them. We expect that a TF with higher expression and lower methylation in the binding regions will be a better MR TF candidate than TF that are for example equally expressed on both groups.
# time expected ~5 minutes
TF <- get.TFs(data = mae,
group.col = group.col,
mode = mode,
group1 = group1,
group2 = group2,
diff.dir = direction,
enriched.motif = enriched.motif,
dir.out = dir.out,
label = direction,
cores = cores)
message("Check the results for the GATA4_HUMAN.H11MO.0.A motif")
TF["GATA4_HUMAN.H11MO.0.A",1:5]
ELMER provides an easy way to visualize the rank of all human TF evaluated with the
function TF.rank.plot
.
# Plot size options
options(repr.plot.width=7, repr.plot.height=7)
load(file.path(dir.out,"getTF.hypo.TFs.with.motif.pvalue.rda"))
motif <- "GATA4_HUMAN.H11MO.0.A"
TF.rank.plot(motif.pvalue = TF.meth.cor,
motif = motif,
save = FALSE)
When you perform several ELMER analysis with different arguments, you might want to summarize and visualize those several results. For that, we have the ELMER:::get.tabs
function , which accepts as input
the directories of the results and if you want to select the MR TF within the
family of subfamily classification.
In the Data/analysis_april_2019
folder we provided 4 ELMER analysis. The first step is to retrieve the directory names with the results. Also, it is a golden rule when running several ELMER runs to add a understable name to the dir.out
directories, such that you easily know which analysis were ran, with which arguments.
In the example below, you can identify the tumor type, type of ELMER run (supervised/unsupervised), groups compared and direction of the analysis (hypo or hyper in group 1).
analsysis.dir <- unique(dirname(dir("Data/analysis_april_2019/",recursive = T,full.names = T)))
analsysis.dir
With those directory names we will use the function ELMER:::get.tabs
to summarize the results.
classification <- "subfamily"
suppressWarnings({
suppressMessages({
tabs <- ELMER:::get.tabs(analsysis.dir,classification)
})
})
# A list of matrices are returned
print(class(tabs))
# names of the matrices
print(names(tabs))
# The default it the name directory as column name
tabs$tab[1:3,1:4]
# To be more readable we will change the path names to the analysis name
analysis.names <- c( "COAD-READ (vs Normal)",
"EAC (vs Normal)",
"EAC (vs ESCC)",
"PAAD (vs Normal)")
for (i in 1:3) colnames(tabs[[i]]) <- analysis.names
This function creates 4 matrices using some of the files in each analysis directory:
getTF.hypo.significant.TFs.with.motif.summary.csv
(create after you run the get.TFs
function) file that has the MR TF for each enriched motif.# 1. a binary matrix saying if the TF was found or not in the analysis
tab.binary <- tabs$tab
head(tab.binary)
getTF.hypo.significant.TFs.with.motif.summary.csv
to identify the MR TF and getTF.hypo.TFs.with.motif.pvalue.rda
, which is a matrix with TF as columns and enriched motifs as rows and the contains the p-value of the wilcoxon test of the TF expression for the Unmethylated vs Methylated groups.# 2. a matrix with the MR TF p-value found in the analysis
tab.pval <- tabs$tab.pval
head(tab.pval)
getTF.hypo.significant.TFs.with.motif.summary.csv
to get the MR TF and then uses getMotif.hypo.motif.enrichment.csv
to get the motifs OR to which the MR TF is known to bind and returns the highest OR.# 3. a matrix with the highest motif OR for the MR TF found in the analysis
tab.or <- tabs$tab.or
head(tab.or)
# 4. a matrix with 4 collums: MR TF, motif, FDR and analysis it was identified.
tf.or.table <- tabs$tf.or.table
# Rename analysis
tf.or.table$analysis <- gsub("ESCA","EAC",paste0(basename(dirname(dirname(dirname(tf.or.table$analysis)))),
" (vs ",
ifelse(grepl("Normal",tf.or.table$analysis),"Normal","ESCC"),")"))
head(tf.or.table)
With those summarized results we can create two summary plots:
The code below creates the heatmap using complexHeatmap package for the binary MR TF matrix and the p-value one.
library(ComplexHeatmap)
library(vegan)
options(repr.plot.width=14, repr.plot.height=8)
col <- ifelse(classification == "family","top.potential.TF.family", "top.potential.TF.subfamily")
analysis.colors <- c(
"COAD-READ (vs Normal)" = '#ff964f',
"EAC (vs Normal)" = '#94e894',
"EAC (vs ESCC)" = '#eaadb9',
"PAAD (vs Normal)" = '#9ab9f9'
)
# get top TFs in each anaylsis
labels <- ELMER:::get.top.tf.by.pval(tab.pval,top = nrow(tab.pval))
# Annotate each column of the matrix (it should be correspondant to the input matrix)
hb = HeatmapAnnotation(df = data.frame("Mode" = rep("unsupervised",4),
"Analysis" = analysis.names),
col = list("Analysis" = analysis.colors,
"Mode" = c("unsupervised" = "#ebd540",
"supervised" = "#718da5")),
show_annotation_name = FALSE,
show_legend = T,
annotation_name_side = "left",
annotation_name_gp = gpar(fontsize = 12))
# Binary heatmap
ht_list_binary <- Heatmap(as.matrix(tab.binary),
name = "Binary heatmap" ,
col = c("1" = "black", "0" = "white"),
column_names_gp = gpar(fontsize = 8),
show_column_names = FALSE,
use_raster = TRUE,
na_col = "white",
raster_device = c("png"),
raster_quality = 2,
show_row_names = F,
show_heatmap_legend = TRUE,
cluster_columns = FALSE,
cluster_rows = TRUE,
show_column_dend = FALSE,
show_row_dend = FALSE,
clustering_distance_rows = function(x) {
vegan::vegdist(tab.binary,method = "jaccard",binary = TRUE)
},
clustering_method_rows = "average",
top_annotation = hb,
width = unit(5, "cm"),
row_names_gp = gpar(fontsize = 1),
column_title = paste0("Binrary MR TF Heatmap"),
column_title_gp = gpar(fontsize = 10),
row_title_gp = gpar(fontsize = 16))
ht_list_binary <- ht_list_binary +
rowAnnotation(link = anno_mark(at = match(labels,rownames(tab.pval)),
labels = labels,labels_gp = gpar(fontsize = 10)),
width = unit(1, "cm") + max_text_width(labels)
)
draw(ht_list_binary,
newpage = FALSE,
column_title_gp = gpar(fontsize = 12, fontface = "bold"),
heatmap_legend_side = "left",
show_heatmap_legend = TRUE,
annotation_legend_side = "right")
# P-value heatmap
ht_list <- Heatmap(scale(as.matrix(-log10(tab.pval))),
name = "column z-score FDR",
col = colorRampPalette(c('#3361A5', '#248AF3', '#14B3FF',
'#88CEEF', '#C1D5DC', '#EAD397',
'#FDB31A','#E42A2A', '#A31D1D'))(100),
column_names_gp = gpar(fontsize = 8),
show_column_names = FALSE,
use_raster = TRUE,
na_col = "white",
raster_device = c("png"),
raster_quality = 2,
show_row_names = F,
show_heatmap_legend = TRUE,
cluster_columns = FALSE,
cluster_rows = FALSE,
row_order = row_order(ht_list_binary),
top_annotation = hb,
width = unit(5, "cm"),
row_names_gp = gpar(fontsize = 1),
column_title = paste0("MR TF Heatmap"),
column_title_gp = gpar(fontsize = 10),
row_title_gp = gpar(fontsize = 16))
ht_list <- ht_list +
rowAnnotation(link = anno_mark(at = match(labels,rownames(tab.pval)),
labels = labels,labels_gp = gpar(fontsize = 10)),
width = unit(1, "cm") + max_text_width(labels)
)
draw(ht_list,
newpage = TRUE,
column_title_gp = gpar(fontsize = 12, fontface = "bold"),
heatmap_legend_side = "left",
show_heatmap_legend = TRUE,
annotation_legend_side = "right")