/**
* jQuery Plugin: Sticky Tabs
*
* @author Aidan Lister
// Set the correct tab when the page loads showStuffFromHash(context);
// Set the correct tab when a user uses their back/forward button $(window).on('hashchange', function() { showStuffFromHash(context); });
// Change the URL when tabs are clicked $('a', context).on('click', function(e) { history.pushState(null, null, this.href); showStuffFromHash(context); });
return this; }; }(jQuery));
window.buildTabsets = function(tocID) {
// build a tabset from a section div with the .tabset class function buildTabset(tabset) {
// check for fade and pills options var fade = tabset.hasClass("tabset-fade"); var pills = tabset.hasClass("tabset-pills"); var navClass = pills ? "nav-pills" : "nav-tabs";
// determine the heading level of the tabset and tabs var match = tabset.attr('class').match(/level(\d) /); if (match === null) return; var tabsetLevel = Number(match[1]); var tabLevel = tabsetLevel + 1;
// find all subheadings immediately below var tabs = tabset.find("div.section.level" + tabLevel); if (!tabs.length) return;
// create tablist and tab-content elements var tabList = $('
'); $(tabs[0]).before(tabList); var tabContent = $('
'); $(tabs[0]).before(tabContent);
// build the tabset var activeTab = 0; tabs.each(function(i) {
// get the tab div var tab = $(tabs[i]);
// get the id then sanitize it for use with bootstrap tabs var id = tab.attr('id');
// see if this is marked as the active tab if (tab.hasClass('active')) activeTab = i;
// remove any table of contents entries associated with // this ID (since we'll be removing the heading element) $("div#" + tocID + " li a[href='#" + id + "']").parent().remove();
// sanitize the id for use with bootstrap tabs id = id.replace(/[.\/?&!#<>]/g, '').replace(/\s/g, '_'); tab.attr('id', id);
// get the heading element within it, grab it's text, then remove it var heading = tab.find('h' + tabLevel + ':first'); var headingText = heading.html(); heading.remove();
// build and append the tab list item var a = $('' + headingText + ''); a.attr('href', '#' + id); a.attr('aria-controls', id); var li = $('
'); li.append(a); tabList.append(li);
// set it's attributes tab.attr('role', 'tabpanel'); tab.addClass('tab-pane'); tab.addClass('tabbed-pane'); if (fade) tab.addClass('fade');
// move it into the tab content div tab.detach().appendTo(tabContent); });
// set active tab $(tabList.children('li')[activeTab]).addClass('active'); var active = $(tabContent.children('div.section')[activeTab]); active.addClass('active'); if (fade) active.addClass('in');
if (tabset.hasClass("tabset-sticky")) tabset.rmarkdownStickyTabs(); }
// convert section divs with the .tabset class to tabsets var tabsets = $("div.section.tabset"); tabsets.each(function(i) { buildTabset($(tabsets[i])); }); };
MA plots with ggplot: easy R tutorial
In this blogpost, we will learn how to create our own MA plot with ggplot, following an easy step-by-step tutorial in R. If you haven’t yet, check out my other blogpost where I go over the basics of this visualisation and how to interpret an MA plot. Otherwise, let’s learn how to create and customise an MA plot in R.
So if you are ready… let’s dive in!
MA plots in R: easy tutorial
Installing necessary packages
For this tutorial you will need R, or Rstudio, and you will need to install the following packages:
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
# For basic plotting
install.packages(c("ggplot2", "dplyr", "gridExtra"))
# For bioinformatics applications
BiocManager::install(c("limma", "edgeR", "DESeq2"))
# Additionally, we'll use the dataset "airway" for our analysis
# https://bioconductor.org/packages/release/data/experiment/html/airway.html
BiocManager::install('airway')
Setting Up the Environment
First, let’s set up our R environment and load the necessary packages.
# Clean environment and optimize settings
rm(list = ls(all.names = TRUE))
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 621111 33.2 1423468 76.1 700316 37.5
## Vcells 1157485 8.9 8388608 64.0 1876568 14.4
options(max.print = .Machine$integer.max, scipen = 999, stringsAsFactors = F, dplyr.summarise.inform = F)
# Loading relevant libraries
library(tidyverse)
library(gridExtra)
library(limma)
library(edgeR)
library(DESeq2)
library(airway)
# Set seed for reproducibility
set.seed(123)
SquidTip! Always clean your environment and set a seed to ensure reproducible results.
Loading sample patient data
For this tutorial, we’ll use public RNAseq dataset, (airway). It has read counts for an RNA-Seq experiment on four human airway smooth muscle cell lines treated with dexamethasone. Feel free to skip this section if you already have your own dataset.
In this case, we have 4 samples and 2 conditions, treated (trt) and untreated (untrt). The format in this case is a SummarizedExperiment, but you can create an MA plot from many different formats.
Essentially, what you need is two vectors of values you want to compare.
# Get sample expression data
data("airway")
table(airway$cell, airway$dex)
##
## trt untrt
## N052611 1 1
## N061011 1 1
## N080611 1 1
## N61311 1 1
class(airway)
## [1] "RangedSummarizedExperiment"
## attr(,"package")
## [1] "SummarizedExperiment"
airway@colData %>% head()
## DataFrame with 6 rows and 9 columns
## SampleName cell dex albut Run avgLength
## <factor> <factor> <factor> <factor> <factor> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126
## Experiment Sample BioSample
## <factor> <factor> <factor>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRX384354 SRS508576 SAMN02422673
Before we go into the actual MA plot, let’s do some basic preprocessing. It’s important to use normalised counts when comparing two samples. If you use raw counts, you’ll introduce:
- Mean-variance dependence (low-count genes have high variance)
- Bias in fold change estimates, especially for low-expression genes
So let’s normalise our values first. We will use the R package DESeq2. I won’t worry too much about batch effects etc, I’ll just add treatment as a covariate (but definitely check for this if you’re using your own data!).
# Create DESeqDataSet
dds <- DESeqDataSet(airway, design = ~ dex)
# Run DESeq() to estimate size factors and normalize
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
MA plot from a dataframe
First, we’ll learn how to create an MA plot from a dataframe, or two vectors: one for each condition. This will allow you to apply the code to a wide range of formats.
As an example, let’s imagine we just want to compare the results for one sample: N052611. We’ll create an MA plot showing treated vs untreated results for that sample specifically.
# Filter for N052611 cell line only
norm_counts <- counts(dds, normalized = TRUE)
norm_counts[1:5,1:5]
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003 663.31418 499.90698 740.15284 608.90630 966.3137
## ENSG00000000005 0.00000 0.00000 0.00000 0.00000 0.0000
## ENSG00000000419 456.21167 574.66985 526.50047 544.73235 498.4413
## ENSG00000000457 253.99365 235.44726 222.97846 244.75646 208.0377
## ENSG00000000460 58.61392 61.37251 33.91307 52.23461 66.2324
# Find the treated and untreated samples for N052611
sample_data <- colData(dds)
print(as.data.frame(sample_data))
## SampleName cell dex albut Run avgLength Experiment
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120 SRX384353
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101 SRX384357
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358
## Sample BioSample sizeFactor
## SRR1039508 SRS508568 SAMN02422669 1.0236476
## SRR1039509 SRS508567 SAMN02422675 0.8961667
## SRR1039512 SRS508571 SAMN02422678 1.1794861
## SRR1039513 SRS508572 SAMN02422670 0.6700538
## SRR1039516 SRS508575 SAMN02422682 1.1776714
## SRR1039517 SRS508576 SAMN02422673 1.3990365
## SRR1039520 SRS508579 SAMN02422683 0.9207787
## SRR1039521 SRS508580 SAMN02422677 0.9445141
# For N052611, find treated and untreated samples (you can also figure it out by looking at the metadata directly)
n052611_samples <- rownames(sample_data)[sample_data$cell == "N052611"]
untreated_sample <- n052611_samples[sample_data[n052611_samples, "dex"] == "untrt"]
treated_sample <- n052611_samples[sample_data[n052611_samples, "dex"] == "trt"]
cat("Untreated:", untreated_sample, "\n")
## Untreated: SRR1039512
cat("Treated:", treated_sample, "\n")
## Treated: SRR1039513
Now, I’ll create a dataframe which will be the input for my MA plot.
# Extract the counts as a dataframe
expr_df <- as.data.frame(norm_counts[, c(untreated_sample, treated_sample)])
expr_df[1:5,1:2]
## SRR1039512 SRR1039513
## ENSG00000000003 740.15284 608.90630
## ENSG00000000005 0.00000 0.00000
## ENSG00000000419 526.50047 544.73235
## ENSG00000000457 222.97846 244.75646
## ENSG00000000460 33.91307 52.23461
# Let's change the column names to the dex column (treated or untreated) instead of the RunID to make it clearer
# Careful when replacing column names! Make sure you are matching the right name.
expr_df <- expr_df %>% dplyr::rename(untrt = untreated_sample,
trt = treated_sample)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(untreated_sample)
##
## # Now:
## data %>% select(all_of(untreated_sample))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(treated_sample)
##
## # Now:
## data %>% select(all_of(treated_sample))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
head(expr_df)
## untrt trt
## ENSG00000000003 740.152838 608.90630
## ENSG00000000005 0.000000 0.00000
## ENSG00000000419 526.500473 544.73235
## ENSG00000000457 222.978461 244.75646
## ENSG00000000460 33.913074 52.23461
## ENSG00000000938 1.695654 0.00000
Nice! Now we can calculate the M and A values using our two vectors, one for untreated values and one for treated values.
SquidTip! Note that we usually do treatment compared to control. If you switch around the order, you will get an inverted plot - the results are the same, but you would be comparing genes that are higher/lower in control compared to treatment.
# Calculate M and A values
M <- log1p(expr_df$trt) - log1p(expr_df$untrt) # Log fold change
A <- (log1p(expr_df$untrt) + log1p(expr_df$trt)) / 2 # Average log expression
SquidTip! I use the log1p function to calculate log because it automatically adds a small number to avoid errors when an expression value is 0 (since you cannot compute log(0)). But you can use log2(expr_df$untrt + 0.01), for example.
Now we’ll put it all together in a dataframe, marking genes with M (log2FC) > 1 as differentially expressed in treatment vs control. Depending on your dataset, you might want to add a lower or higher threshold, but the MA plot is mostly used to give you an idea of how many genes are different in one sample versus another, so don’t worry too much about choosing a threshold - you can also plot several thresholds!
It’s important to note that we are not doing any statistical test, we cannot say whether the difference is significant or not.
# Create data frame
ma_data <- data.frame(A = A,
M = M,
Gene = row.names(expr_df),
Differentially_expressed = abs(M) > 1)
head(ma_data)
## A M Gene Differentially_expressed
## 1 6.5107561 -0.19490152 ENSG00000000003 FALSE
## 2 0.0000000 0.00000000 ENSG00000000005 FALSE
## 3 6.2851392 0.03397890 ENSG00000000419 FALSE
## 4 5.4579455 0.09279114 ENSG00000000457 FALSE
## 5 3.7637851 0.42184737 ENSG00000000460 FALSE
## 6 0.4958204 -0.99164073 ENSG00000000938 FALSE
# Looks like we have 13285 significant genes!
table(ma_data$Differentially_expressed)
##
## FALSE TRUE
## 61352 2325
Easy! Now we can create our MA plot using ggplot2!
p1 <- ggplot(ma_data, aes(x = A, y = M)) +
geom_point(aes(color = Differentially_expressed), alpha = 0.6, size = 1.2) +
scale_color_manual(values = c("FALSE" = "gray60", "TRUE" = "violet")) +
geom_hline(yintercept = 0, color = "darkblue", linetype = "dashed", linewidth = 1) +
labs(title = "MA Plot: treated vs untreated (N052611)",
x = "A",
y = "M",
color = "High differential expression") +
theme_bw() +
theme(legend.position = "top")
print(p1)
Amazing! We’ve created a beautiful MA plot.
We see that most genes are pretty close to M = 0, in other words, they’re not differentially expressed in the treated versus untreated sample. There seems to be more genes that have lower expression in the treated versus untreated, than higher.
Just FYI, dexamethasone is a synthetic corticosteroid (or glucocorticoid), commonly used as a potent anti-inflammatory and immunosuppressant drug. So we expect it to cause widespread transcriptional changes, namely upregulation of anti-inflammatory cytokines, glutocorticoids etc and downregulation of genes related with inflammation (chemokines, etc).
Add gene annotations
You might also want to highlight certain genes, in other words, label the points of your MA plot. It’s really easy to do that with geom_text_repel from R package ggrepel! Click here for more ways to edit your labels with ggrepel.
# For example, let's label top 20 genes by fold change
genes_to_label <- ma_data %>%
dplyr::filter(Differentially_expressed == TRUE) %>%
dplyr::arrange(desc(abs(M))) %>%
dplyr::slice(1:20)
print(genes_to_label)
## A M Gene Differentially_expressed
## 1 2.436989 4.873978 ENSG00000179593 TRUE
## 2 4.059786 4.807259 ENSG00000109906 TRUE
## 3 5.145573 4.091351 ENSG00000127954 TRUE
## 4 4.769164 3.859152 ENSG00000171819 TRUE
## 5 2.507862 3.787703 ENSG00000264868 TRUE
## 6 2.884966 3.786650 ENSG00000250978 TRUE
## 7 2.738575 -3.650643 ENSG00000146006 TRUE
## 8 4.823439 3.524976 ENSG00000143127 TRUE
## 9 3.116247 3.273240 ENSG00000101342 TRUE
## 10 1.607006 3.214011 ENSG00000249364 TRUE
## 11 1.604194 -3.208387 ENSG00000263567 TRUE
## 12 2.572768 3.162254 ENSG00000168481 TRUE
## 13 1.576074 3.152149 ENSG00000257322 TRUE
## 14 5.381939 3.146691 ENSG00000163884 TRUE
## 15 4.968381 3.089936 ENSG00000168309 TRUE
## 16 1.543103 3.086205 ENSG00000162267 TRUE
## 17 5.954660 3.025551 ENSG00000152583 TRUE
## 18 3.304617 2.996936 ENSG00000173838 TRUE
## 19 1.489094 -2.978189 ENSG00000157005 TRUE
## 20 3.410935 2.948769 ENSG00000100033 TRUE
# Now, use geom_text_repel to highlight these genes
p1 <- ggplot(ma_data, aes(x = A, y = M)) +
geom_point(aes(color = Differentially_expressed), alpha = 0.6, size = 1.2) +
scale_color_manual(values = c("FALSE" = "gray60", "TRUE" = "violet")) +
geom_hline(yintercept = 0, color = "darkblue", linetype = "dashed", linewidth = 1) +
labs(title = "MA Plot: treated vs untreated (N052611)",
x = "A",
y = "M",
color = "High differential expression") +
ggrepel::geom_text_repel(data = genes_to_label,
aes(label = Gene),
size = 3, color = "black",
max.overlaps = Inf, box.padding = 0.4,
point.padding = 0.3, segment.color = "grey50"
) +
theme_bw() +
theme(legend.position = "top")
print(p1)
Nice! We’ve successfully created an MA plot for one of the samples, comparing the treated vs untreated condition.
What if we wanted to compare treated vs untreated across all samples?
MA plot with replicates
To create an MA plot comparing treated vs untreated using all samples, the steps are pretty much the same. We first need to extract the results table from the DESeq analysis which will give us base means across samples, log2 fold changes, standard errors, test statistics, p-values and adjusted p-values.
# Get results comparing treated vs untreated across all samples
res <- results(dds, contrast = c("dex", "trt", "untrt"))
# Convert to data frame
res_df <- as.data.frame(res)
res_df$gene <- rownames(res_df)
# Remove NAs and infinite values
res_df <- res_df[complete.cases(res_df), ]
head(res_df)
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000000003 708.60217 -0.37884695 0.1731411 -2.1880820 0.02866363
## ENSG00000000419 520.29790 0.20376039 0.1005987 2.0254780 0.04281831
## ENSG00000000457 237.16304 0.03404282 0.1262790 0.2695842 0.78748018
## ENSG00000000460 57.93263 -0.11717864 0.3012365 -0.3889921 0.69728196
## ENSG00000000971 5817.35287 0.44095895 0.2589239 1.7030445 0.08855974
## ENSG00000001036 1282.10639 -0.24194234 0.1196513 -2.0220625 0.04316989
## padj gene
## ENSG00000000003 0.1393079 ENSG00000000003
## ENSG00000000419 0.1833591 ENSG00000000419
## ENSG00000000457 0.9305715 ENSG00000000457
## ENSG00000000460 0.8954410 ENSG00000000460
## ENSG00000000971 0.2993058 ENSG00000000971
## ENSG00000001036 0.1842852 ENSG00000001036
If we check the results dataframe we get from DESeq2, we don’t even need to compute M & A! They’re already there: columns baseMean and log2FoldChange.
Let’s harness the p-value/padj value from the DGE analysis results to highlight genes that are statistically differentially expressed in an MA plot. Remember, the MA plot just plots M (log2FC) versus A (average expression), so you cannot know whether a gene has statistically different expression in one condition versus another by only checking at the position of the point/gene in the plot. For that, we use a volcano plot.
Instead, we will plot an MA plot but highlight significant DEGs (differentially expressed genes) in a different colour. Let’s mark genes as significant if they have M > 1 and padj < 0.05.
# Add significance categories
res_df$significance <- "Not significant"
res_df$significance[res_df$padj < 0.05 & abs(res_df$log2FoldChange) > 1] <- "Significant (padj < 0.05, |FC| > 2)"
res_df$significance[res_df$padj < 0.05 & abs(res_df$log2FoldChange) <= 1] <- "Significant (padj < 0.05, |FC| ≤ 2)"
p1 <- ggplot(res_df, aes(x = baseMean, y = log2FoldChange, color = significance)) +
geom_point(alpha = 0.6, size = 1.2) +
scale_x_log10() +
scale_color_manual(
values = c("Not significant" = "gray60",
"Significant (padj < 0.05, |FC| ≤ 2)" = "lightblue",
"Significant (padj < 0.05, |FC| > 2)" = "darkblue")
) +
geom_hline(yintercept = 0, color = "black", linetype = "dashed", linewidth = 0.8) +
geom_hline(yintercept = c(-1, 1), color = "blue", linetype = "dotted", linewidth = 0.5) +
labs(title = "MA Plot: All Treated vs All Untreated Samples",
x = "Mean of normalized counts (log10 scale)",
y = "log2 fold change (Treated/Untreated)",
color = "Differential Expression") +
theme_bw() +
theme(plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 10, color = "gray40"),
axis.title = element_text(size = 11),
legend.position = "bottom",
legend.title = element_text(face = "bold"),
panel.grid.minor = element_blank()) +
guides(color = guide_legend(override.aes = list(size = 3, alpha = 1)))
print(p1)
Nice! We can also build a quick volcano plot. Follow this tutorial if you’d like to learn how to build a volcano plot step-by-step.
res_df <- res_df %>%
mutate(significant = pvalue < 0.05)
ggplot(res_df, aes(x = log2FoldChange, y = -log10(pvalue))) +
geom_point(aes(color = significant)) +
scale_color_manual(values = c("TRUE" = "#D60093", "FALSE" = "gray")) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
coord_cartesian(xlim = c(-5, 5), ylim = c(0, 50)) +
theme_bw() +
labs(
x = expression(Log[2]~Fold~Change),
y = expression(-Log[10]~p~value),
title = "Volcano Plot (DESeq2 results)",
color = "Significant (p < 0.05)"
)
Squidtastic! And that’s the end of this tutorial.
In this post, we covered how to interpret and create MA plots in R. Hope you found it useful!
As you saw, MA plots are a great way to compare two samples and see which genes are differentially expressed. As key takeaways, when looking at an MA plot, don’t forget about:
- Centering: The horizontal line at M = 0 represents no change between conditions
- Spread: Wide spread in M values at low A values often indicates technical noise
- Bias: Systematic deviation from M = 0 may indicate technical bias
- Significance: Consider both fold change magnitude and statistical significance
Additional resources
You might be interested in…
- MA plots easily explained
- MA plots in python
- Volcano plots in R
- You can also plot MA plots using R package ggmaplot
Squidtastic! You made it till the end! Hope you found this post useful.
If you have any questions, or if there are any more topics you would like to see here, leave me a comment down below.
Otherwise, have a very nice day and… see you in the next one!
Before you go, you might want to check:
// add bootstrap table styles to pandoc tables function bootstrapStylePandocTables() { $('tr.odd').parent('tbody').parent('table').addClass('table table-condensed'); } $(document).ready(function () { bootstrapStylePandocTables(); });