mashr is a Bayesian statistical method to borrow information across genes and cell types (Urbut, et al, 2019). mashr takes estimated log fold changes and standard errors for each cell type and gene from dreamlet, and produces posterior estimates with more accuracy and precision than the original parameter estimates.
dreamlet analysis
Here single cell RNA-seq data is downloaded from ExperimentHub.
library(dreamlet)
library(muscat)
library(ExperimentHub)
library(zenith)
library(scater)
# Retrieve the Kang, et al study, stored in ExperimentHub as EH2259
eh <- ExperimentHub()
sce <- eh[["EH2259"]]

# drop genes with no counts in any cell
sce <- sce[rowSums(counts(sce) > 0) > 0, ]

# retain only the cells annotated as singlets
sce <- sce[, colData(sce)$multiplets == "singlet"]

# per-cell quality control metrics
qc <- perCellQCMetrics(sce)

# flag cells whose number of detected genes is an outlier
# (more than 2 MADs from the median on the log scale), then drop them
ol <- isOutlier(qc$detected, nmads = 2, log = TRUE)
sce <- sce[, !ol]

# keep genes with a count above 1 in at least 10 cells,
# then compute library size factors and log-normalized counts
sce <- sce[rowSums(counts(sce) > 1) >= 10, ]
sce <- computeLibraryFactors(sce)
sce <- logNormCounts(sce)

# StimStatus records whether a cell is stimulated (stim) or control (ctrl)
sce$StimStatus <- sce$stim

# 'ind' is the individual and 'StimStatus' is the stimulus status,
# so concatenating the two gives a unique identifier for each sample
sce$id <- paste0(sce$StimStatus, sce$ind)

# Aggregate cells into pseudobulk, one assay per cell cluster
#   assay:      entry in assayNames(sce) storing raw counts
#   cluster_id: variable in colData(sce) indicating cell clusters
#   sample_id:  variable in colData(sce) indicating sample id
#               used for aggregating cells
pb <- aggregateToPseudoBulk(
  sce,
  assay = "counts",
  cluster_id = "cell",
  sample_id = "id",
  verbose = FALSE
)
dreamlet for pseudobulk
# Normalize and apply voom/voomWithDreamWeights
res.proc <- processAssays(pb, ~StimStatus, min.count = 5)

# Fit the differential expression model separately for each assay,
# using the voom normalized data
res.dl <- dreamlet(res.proc, ~StimStatus)
mashr analysis
# run mashr model to borrow information across genes and
# cell types in estimating coefficients' posterior distribution
res_mash <- run_mash(
  res.dl,
  coef = "StimStatusstim"
)
mashr results
Compute summary of mashr posterior distributions
library(mashr)

# Pull statistics out of the fitted mashr model.
# An NA entry means the gene was not sufficiently expressed
# in that cell type.

# logFC as originally estimated (first 4 genes, first 4 cell types)
head(res_mash$logFC.original)[seq_len(4), seq_len(4)]
## B cells CD14+ Monocytes CD4 T cells CD8 T cells
## A1BG NA NA -0.73718671 NA
## AAAS NA NA -0.56991157 NA
## AAED1 NA 1.426001 0.07140051 NA
## AAK1 NA NA -0.91972740 NA

# posterior mean of the logFC for the same entries
head(get_pm(res_mash$model))[seq_len(4), seq_len(4)]
## B cells CD14+ Monocytes CD4 T cells CD8 T cells
## A1BG NA NA -0.6327307 NA
## AAAS NA NA -0.4543872 NA
## AAED1 NA 1.378843 0.0201326 NA
## AAK1 NA NA -0.8578750 NA
# number of significant gene-by-celltype tests (lfsr < 0.05);
# a gene significant in 2 cell types is counted twice
table(
  get_lfsr(res_mash$model) < 0.05,
  useNA = "ifany"
)
##
## FALSE TRUE <NA>
## 8089 6073 30134

# number of genes significant in at least one cell type,
# i.e. whose smallest lfsr across cell types is below 0.05
table(
  apply(get_lfsr(res_mash$model), MARGIN = 1, FUN = min, na.rm = TRUE) < 0.05
)
##
## FALSE TRUE
## 2568 2969

# number of significant genes within each cell type
apply(
  get_lfsr(res_mash$model),
  MARGIN = 2,
  FUN = function(x) sum(x < 0.05, na.rm = TRUE)
)
## B cells CD14+ Monocytes CD4 T cells CD8 T cells
## 767 2086 1525 412
## Dendritic cells FCGR3A+ Monocytes Megakaryocytes NK cells
## 52 566 36 629
# examine genes that are significant in at least 1 cell type:
# names are sorted alphabetically and the first 10 are shown
sort(
  names(get_significant_results(res_mash$model))
)[seq_len(10)]
## [1] "ACTB" "ACTG1_ENSG00000184009" "ARPC1B"
## [4] "ATP6V0E1" "B2M" "BTF3"
## [7] "BTG1" "CALM2" "CD74"
## [10] "CFL1"
# The raw logFC for ISG20 varies a lot across cell types
res_mash$logFC.original["ISG20", , drop = TRUE]
## B cells CD14+ Monocytes CD4 T cells CD8 T cells
## 3.200534 5.865638 3.060855 3.533391
## Dendritic cells FCGR3A+ Monocytes Megakaryocytes NK cells
## 3.593594 4.370017 NA 3.577744

# posterior mean for the same gene,
# after borrowing across cell type and genes
get_pm(res_mash$model)["ISG20", , drop = TRUE]
## B cells CD14+ Monocytes CD4 T cells CD8 T cells
## 3.201633 5.807546 3.063965 3.535864
## Dendritic cells FCGR3A+ Monocytes Megakaryocytes NK cells
## 3.601904 4.350143 NA 3.577692
Perform gene set analysis with zenith using posterior mean for each coefficient
# gene set analysis using mashr results
library(zenith)

# Load the Gene Ontology database ("CC" ontology)
# genes can be identified by 'SYMBOL' or 'ENSEMBL' id
# get_MSigDB() can be used instead to load MSigDB
go.gs <- get_GeneOntology("CC", to = "SYMBOL")

# valid values for statistic:
# "tstatistic", "abs(tstatistic)", "logFC", "abs(logFC)"
df_gs <- zenith_gsa(res_mash, go.gs)

# Heatmap of results
plotZenithResults(df_gs, ntop = 5, nbottom = 1)

# forest plot of ISG20 based on mashr results
plotForest(res_mash, "ISG20")
Volcano plot based on local False Sign Rate (lFSR) estimated from the posterior distribution of each coefficient.
# volcano plot based on mashr results
# the y-axis uses the local false sign rate (lfsr)
plotVolcano(res_mash)