nnSVG 1.8.0
nnSVG is a method for scalable identification of spatially variable genes (SVGs) in spatially-resolved transcriptomics data.
The nnSVG method is based on nearest-neighbor Gaussian processes (Datta et al., 2016, Finley et al., 2019) and uses the BRISC algorithm (Saha and Datta, 2018) for model fitting and parameter estimation. nnSVG allows identification and ranking of SVGs with flexible length scales across a tissue slide or within spatial domains defined by covariates. The method scales linearly with the number of spatial locations and can be applied to datasets containing thousands or more spatial locations.
nnSVG is implemented as an R package within the Bioconductor framework, and is available from Bioconductor.
More details describing the method are available in our paper, published in Nature Communications.
The following code will install the latest release version of the nnSVG
package from Bioconductor. Additional details are shown on the Bioconductor page.
# install BiocManager from CRAN only if it is not already installed,
# to avoid an unnecessary reinstall each time this code is run
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# install the latest release version of nnSVG from Bioconductor
BiocManager::install("nnSVG")
The latest development version can also be installed from the devel
version of Bioconductor or from GitHub.
In the examples below, we assume the input data are provided as a SpatialExperiment Bioconductor object. In this case, the outputs are stored in the rowData of the SpatialExperiment object.
Alternatively, the inputs can also be provided as a numeric matrix of normalized and transformed counts (e.g. log-transformed normalized counts, also known as logcounts) and a numeric matrix of spatial coordinates.
Here we show a short example demonstrating how to run nnSVG.
For faster runtime in this example, we subsample the dataset and run nnSVG
on only a small number of genes. For a full analysis, the subsampling step can be skipped.
library(SpatialExperiment)
library(STexampleData)
library(scran)
library(nnSVG)
library(ggplot2)
# load example dataset from STexampleData package
# see '?Visium_humanDLPFC' for more details
spe <- Visium_humanDLPFC()
## see ?STexampleData and browseVignettes('STexampleData') for documentation
## loading from cache
# dimensions are reported as rows (genes) x columns (spots)
dim(spe)
## [1] 33538 4992
# preprocessing steps
# keep only spots over tissue (columns where 'in_tissue' equals 1)
spe <- spe[, colData(spe)$in_tissue == 1]
# check how many spots remain after subsetting
dim(spe)
## [1] 33538 3639
# skip spot-level quality control, since this has been performed previously
# on this dataset
# filter low-expressed and mitochondrial genes
# using default filtering parameters
# (the messages below report the thresholds used and number of genes removed)
spe <- filter_genes(spe)
## Gene filtering: removing mitochondrial genes
## removed 13 mitochondrial genes
## Gene filtering: retaining genes with at least 3 counts in at least 0.5% (n = 19) of spatial locations
## removed 30216 out of 33525 genes due to low expression
# calculate logcounts (log-transformed normalized counts) using scran package
# using library size factors
spe <- computeLibraryFactors(spe)
spe <- logNormCounts(spe)
# confirm that a 'logcounts' assay is now present alongside 'counts'
assayNames(spe)
## [1] "counts" "logcounts"
# select small set of random genes and several known SVGs for
# faster runtime in this example
set.seed(123)
ix_random <- sample(seq_len(nrow(spe)), 10)
known_genes <- c("MOBP", "PCP4", "SNAP25", "HBB", "IGKC", "NPY")
ix_known <- which(rowData(spe)$gene_name %in% known_genes)
# remove duplicate indices in case a randomly sampled gene is also one of the
# known SVGs, so that no gene is included (and fitted) twice
ix <- unique(c(ix_known, ix_random))
spe <- spe[ix, ]
dim(spe)
## [1] 16 3639
# run nnSVG
# set seed for reproducibility
set.seed(123)
# using a single thread in this example
# the object returned by nnSVG() is assigned back to 'spe'
spe <- nnSVG(spe)
## Warning in nnSVG(spe): Rows (genes) and/or columns (spots) containing all zero
## counts have been found. Please see examples in tutorial for code to filter out
## zeros and/or low-expressed genes to avoid errors.
# show results
# (printed as one row per gene, with the nnSVG output columns included)
rowData(spe)
## DataFrame with 16 rows and 17 columns
## gene_id gene_name feature_type sigma.sq
## <character> <character> <character> <numeric>
## ENSG00000211592 ENSG00000211592 IGKC Gene Expression 0.565654
## ENSG00000168314 ENSG00000168314 MOBP Gene Expression 1.387394
## ENSG00000122585 ENSG00000122585 NPY Gene Expression 0.285674
## ENSG00000244734 ENSG00000244734 HBB Gene Expression 0.329421
## ENSG00000132639 ENSG00000132639 SNAP25 Gene Expression 0.430040
## ... ... ... ... ...
## ENSG00000130382 ENSG00000130382 MLLT1 Gene Expression 0.00978555
## ENSG00000036672 ENSG00000036672 USP2 Gene Expression 0.00307277
## ENSG00000086232 ENSG00000086232 EIF2AK1 Gene Expression 0.00315782
## ENSG00000106278 ENSG00000106278 PTPRZ1 Gene Expression 0.00279851
## ENSG00000133606 ENSG00000133606 MKRN1 Gene Expression 0.00632245
## tau.sq phi loglik runtime mean var
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000211592 0.455041 20.10688 -4531.64 3.880 0.622937 1.007454
## ENSG00000168314 0.364188 1.10202 -3663.60 1.914 0.805525 1.205673
## ENSG00000122585 0.280173 71.65329 -3995.23 2.836 0.393975 0.567383
## ENSG00000244734 0.353754 27.81410 -4044.96 4.221 0.411262 0.697673
## ENSG00000132639 0.430106 3.03385 -3912.70 0.839 3.451926 0.857922
## ... ... ... ... ... ... ...
## ENSG00000130382 0.283115 50.9880748 -2927.61 1.221 0.298698 0.292976
## ENSG00000036672 0.241105 12.5382833 -2597.00 0.838 0.248384 0.244218
## ENSG00000086232 0.266973 25.9302215 -2781.47 0.844 0.275193 0.270208
## ENSG00000106278 0.367893 9.5280046 -3357.32 1.047 0.352159 0.370784
## ENSG00000133606 0.272432 0.0827087 -2831.51 1.192 0.295404 0.278806
## spcov prop_sv loglik_lm LR_stat rank pval
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000211592 1.207346 0.554185 -5176.53 1289.775 3 0
## ENSG00000168314 1.462248 0.792080 -5503.33 3679.464 1 0
## ENSG00000122585 1.356646 0.504861 -4131.87 273.278 6 0
## ENSG00000244734 1.395587 0.482191 -4507.99 926.046 4 0
## ENSG00000132639 0.189973 0.499961 -4884.19 1942.986 2 0
## ... ... ... ... ... ... ...
## ENSG00000130382 0.331177 0.03340915 -2929.28 3.35216 12 0.187106089
## ENSG00000036672 0.223173 0.01258414 -2598.08 2.15483 13 0.340473854
## ENSG00000086232 0.204200 0.01168999 -2782.09 1.23716 14 0.538708116
## ENSG00000106278 0.150219 0.00754943 -3357.83 1.01111 15 0.603169436
## ENSG00000133606 0.269170 0.02268111 -2839.08 15.15227 9 0.000512539
## padj
## <numeric>
## ENSG00000211592 0
## ENSG00000168314 0
## ENSG00000122585 0
## ENSG00000244734 0
## ENSG00000132639 0
## ... ...
## ENSG00000130382 0.24947479
## ENSG00000036672 0.41904474
## ENSG00000086232 0.61566642
## ENSG00000106278 0.64338073
## ENSG00000133606 0.00091118
The results are stored in the rowData of the SpatialExperiment object.
The main results of interest are:
- LR_stat: likelihood ratio (LR) statistics
- rank: rank of top SVGs according to LR statistics
- pval: p-values from asymptotic chi-squared distribution with 2 degrees of freedom
- padj: p-values adjusted for multiple testing, which can be used to define a cutoff for statistically significant SVGs (e.g. padj <= 0.05)
- prop_sv: effect size, defined as proportion of spatial variance out of total variance

# number of significant SVGs
# tabulate genes by whether the adjusted p-value passes the 0.05 cutoff
table(rowData(spe)$padj <= 0.05)
##
## FALSE TRUE
## 7 9
# show results for top n SVGs
# head() returns all available genes when fewer than n are present, whereas
# indexing with [1:n] would create NA subscripts in that case
rowData(spe)[head(order(rowData(spe)$rank), 10), ]
## DataFrame with 10 rows and 17 columns
## gene_id gene_name feature_type sigma.sq
## <character> <character> <character> <numeric>
## ENSG00000168314 ENSG00000168314 MOBP Gene Expression 1.38739399
## ENSG00000132639 ENSG00000132639 SNAP25 Gene Expression 0.43003959
## ENSG00000211592 ENSG00000211592 IGKC Gene Expression 0.56565436
## ENSG00000244734 ENSG00000244734 HBB Gene Expression 0.32942113
## ENSG00000183036 ENSG00000183036 PCP4 Gene Expression 0.23102221
## ENSG00000122585 ENSG00000122585 NPY Gene Expression 0.28567358
## ENSG00000129562 ENSG00000129562 DAD1 Gene Expression 0.02389606
## ENSG00000114923 ENSG00000114923 SLC4A3 Gene Expression 0.01147170
## ENSG00000133606 ENSG00000133606 MKRN1 Gene Expression 0.00632245
## ENSG00000143543 ENSG00000143543 JTB Gene Expression 0.07547797
## tau.sq phi loglik runtime mean var
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000168314 0.364188 1.1020177 -3663.60 1.914 0.805525 1.205673
## ENSG00000132639 0.430106 3.0338473 -3912.70 0.839 3.451926 0.857922
## ENSG00000211592 0.455041 20.1068839 -4531.64 3.880 0.622937 1.007454
## ENSG00000244734 0.353754 27.8140976 -4044.96 4.221 0.411262 0.697673
## ENSG00000183036 0.452735 8.2722785 -4026.22 0.759 0.687961 0.684598
## ENSG00000122585 0.280173 71.6532892 -3995.23 2.836 0.393975 0.567383
## ENSG00000129562 0.464723 10.1418819 -3842.24 1.060 0.549318 0.489167
## ENSG00000114923 0.237260 12.7656826 -2617.36 1.220 0.250768 0.248816
## ENSG00000133606 0.272432 0.0827087 -2831.51 1.192 0.295404 0.278806
## ENSG00000143543 0.463561 119.7470905 -4036.28 1.366 0.654919 0.539172
## spcov prop_sv loglik_lm LR_stat rank pval
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000168314 1.462248 0.7920804 -5503.33 3679.46397 1 0.00000e+00
## ENSG00000132639 0.189973 0.4999614 -4884.19 1942.98556 2 0.00000e+00
## ENSG00000211592 1.207346 0.5541853 -5176.53 1289.77508 3 0.00000e+00
## ENSG00000244734 1.395587 0.4821910 -4507.99 926.04573 4 0.00000e+00
## ENSG00000183036 0.698656 0.3378716 -4473.57 894.68884 5 0.00000e+00
## ENSG00000122585 1.356646 0.5048608 -4131.87 273.27818 6 0.00000e+00
## ENSG00000129562 0.281410 0.0489053 -3861.98 39.49098 7 2.65854e-09
## ENSG00000114923 0.427112 0.0461207 -2632.02 29.31376 8 4.31119e-07
## ENSG00000133606 0.269170 0.0226811 -2839.08 15.15227 9 5.12539e-04
## ENSG00000143543 0.419491 0.1400231 -4039.07 5.59669 10 6.09108e-02
## padj
## <numeric>
## ENSG00000168314 0.00000e+00
## ENSG00000132639 0.00000e+00
## ENSG00000211592 0.00000e+00
## ENSG00000244734 0.00000e+00
## ENSG00000183036 0.00000e+00
## ENSG00000122585 0.00000e+00
## ENSG00000129562 6.07667e-09
## ENSG00000114923 8.62238e-07
## ENSG00000133606 9.11180e-04
## ENSG00000143543 9.74572e-02
# plot spatial expression of top-ranked SVG
# look up the row index and gene name of the gene with rank 1
ix <- which(rowData(spe)$rank == 1)
ix_name <- rowData(spe)$gene_name[ix]
ix_name
## [1] "MOBP"
# combine the spatial coordinates with the counts of the selected gene
df <- as.data.frame(
  cbind(
    spatialCoords(spe),
    expr = counts(spe)[ix, ]
  )
)
# one point per spot, colored by counts on a square-root color scale;
# the y-axis is reversed and the aspect ratio is fixed
ggplot(
  df,
  aes(
    x = pxl_col_in_fullres,
    y = pxl_row_in_fullres,
    color = expr
  )
) +
  geom_point(size = 0.8) +
  coord_fixed() +
  scale_y_reverse() +
  scale_color_gradient(
    low = "gray90",
    high = "blue",
    trans = "sqrt",
    breaks = range(df$expr),
    name = "counts"
  ) +
  ggtitle(ix_name) +
  theme_bw() +
  # italic gene name in the title; hide grid lines and all axis annotation
  theme(
    plot.title = element_text(face = "italic"),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )