Contents

1 Overview

scrapper implements bindings to C++ code for analyzing single-cell data, mostly from the libscran libraries. Each function performs an individual analysis step ranging from quality control to clustering and marker detection. scrapper is mostly intended for other Bioconductor package developers to build more user-friendly end-to-end workflows.

2 Quick start

Let’s fetch a small single-cell RNA-seq dataset for demonstration purposes:

library(scRNAseq)
sce.z <- ZeiselBrainData()
sce.z
## class: SingleCellExperiment 
## dim: 20006 3005 
## metadata(0):
## assays(1): counts
## rownames(20006): Tspan12 Tshz1 ... mt-Rnr1 mt-Nd4l
## rowData names(1): featureType
## colnames(3005): 1772071015_C02 1772071017_G12 ... 1772066098_A12
##   1772058148_F03
## colData names(9): tissue group # ... level1class level2class
## reducedDimNames(0):
## mainExpName: gene
## altExpNames(2): repeat ERCC

We run it through all the scrapper functions.

# Wrapping it in a DelayedArray to avoid making unnecessary copies when we
# do the various subsetting steps.
library(DelayedArray)
counts <- DelayedArray(assay(sce.z))

# We can set the number of threads higher in real applications, but
# we want to avoid stressing out Bioconductor's build system.
nthreads <- 2

library(scrapper)
is.mito <- grepl("^mt-", rownames(counts))
rna.qc.metrics <- computeRnaQcMetrics(counts, subsets=list(mt=is.mito), num.threads=nthreads)
rna.qc.thresholds <- suggestRnaQcThresholds(rna.qc.metrics)
rna.qc.filter <- filterRnaQcMetrics(rna.qc.thresholds, rna.qc.metrics)

filtered <- counts[,rna.qc.filter,drop=FALSE]
size.factors <- centerSizeFactors(rna.qc.metrics$sum[rna.qc.filter])
normalized <- normalizeCounts(filtered, size.factors)

gene.var <- modelGeneVariances(normalized, num.threads=nthreads)
top.hvgs <- chooseHighlyVariableGenes(gene.var$statistics$residuals)
pca <- runPca(normalized[top.hvgs,], num.threads=nthreads)

umap.out <- runUmap(pca$components, num.threads=nthreads)
tsne.out <- runTsne(pca$components, num.threads=nthreads)
snn.graph <- buildSnnGraph(pca$components, num.threads=nthreads)
clust.out <- clusterGraph(snn.graph)

markers <- scoreMarkers(normalized, groups=clust.out$membership, num.threads=nthreads)

Now we can have a look at some of the results. For example, we can make a t-SNE plot:

plot(tsne.out[,1], tsne.out[,2], pch=clust.out$membership)

We can also have a look at the top markers for each cluster, say, based on the median AUC:

top.markers <- lapply(markers$auc, function(x) {
    head(rownames(x)[order(x$median, decreasing=TRUE)], 10)
})
head(top.markers)
## $`1`
##  [1] "Gad1"   "Gad2"   "Ndrg4"  "Stmn3"  "Tspyl4" "Scg5"   "Syp"    "Snap25"
##  [9] "Nap1l5" "Rab3a" 
## 
## $`2`
##  [1] "Plp1"   "Mbp"    "Cnp"    "Mobp"   "Mog"    "Sept4"  "Ugt8a"  "Trf"   
##  [9] "Gsn"    "Cldn11"
## 
## $`3`
##  [1] "Stmn3"  "Calm1"  "Scg5"   "Chn1"   "Snap25" "Pcsk2"  "Calm2"  "Mdh1"  
##  [9] "Nme1"   "Stmn2" 
## 
## $`4`
##  [1] "Chn1"          "Stmn3"         "Hpca"          "Calm2"        
##  [5] "Neurod6"       "3110035E14Rik" "Crym"          "Calm1"        
##  [9] "Ppp3r1"        "Ppp3ca"       
## 
## $`5`
##  [1] "Ywhah"  "Syp"    "Snap25" "Ndrg4"  "Vsnl1"  "Rab3a"  "Chn1"   "Stmn3" 
##  [9] "Uchl1"  "Ppp3r1"
## 
## $`6`
##  [1] "Atp1b1"        "Ppp3ca"        "Nsg2"          "Tspan13"      
##  [5] "Syp"           "Crym"          "Chn1"          "3110035E14Rik"
##  [9] "Thy1"          "Gria1"

3 Dealing with multiple batches

Let’s fetch another brain dataset and combine it with the previous one.

sce.t <- TasicBrainData()
common <- intersect(rownames(sce.z), rownames(sce.t))
combined <- cbind(
    DelayedArray(assay(sce.t))[common,],
    DelayedArray(assay(sce.z))[common,]
)
block <- rep(c("tasic", "zeisel"), c(ncol(sce.t), ncol(sce.z)))

We can now perform the analysis while blocking on the dataset of origin.

# No mitochondrial genes in the combined set, so we'll just skip it.
library(scrapper)
rna.qc.metrics <- computeRnaQcMetrics(combined, subsets=list(), num.threads=nthreads)
rna.qc.thresholds <- suggestRnaQcThresholds(rna.qc.metrics, block=block)
rna.qc.filter <- filterRnaQcMetrics(rna.qc.thresholds, rna.qc.metrics, block=block)

filtered <- combined[,rna.qc.filter,drop=FALSE]
filtered.block <- block[rna.qc.filter]
size.factors <- centerSizeFactors(rna.qc.metrics$sum[rna.qc.filter], block=filtered.block)
normalized <- normalizeCounts(filtered, size.factors)

gene.var <- modelGeneVariances(normalized, num.threads=nthreads, block=filtered.block)
top.hvgs <- chooseHighlyVariableGenes(gene.var$statistics$residuals)
pca <- runPca(normalized[top.hvgs,], num.threads=nthreads, block=filtered.block)

# Now we do a MNN correction to get rid of the batch effect.
corrected <- correctMnn(pca$components, block=filtered.block, num.threads=nthreads)

umap.out <- runUmap(corrected$corrected, num.threads=nthreads)
tsne.out <- runTsne(corrected$corrected, num.threads=nthreads)
snn.graph <- buildSnnGraph(corrected$corrected, num.threads=nthreads)
clust.out <- clusterGraph(snn.graph)

markers <- scoreMarkers(normalized, groups=clust.out$membership, block=filtered.block, num.threads=nthreads)

We then check whether the datasets were suitably merged together.

plot(tsne.out[,1], tsne.out[,2], pch=16, col=factor(filtered.block))