Contents

Author: Martin Morgan
Date: 26 July, 2019

1 Motivating example

Attach the [TENxBrainData][] experiment data package

library(TENxBrainData)

Load a very large SummarizedExperiment

tenx <- TENxBrainData()
## snapshotDate(): 2019-07-10
## see ?TENxBrainData and browseVignettes('TENxBrainData') for documentation
## downloading 0 resources
## loading from cache
tenx
## class: SingleCellExperiment 
## dim: 27998 1306127 
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(2): Ensembl Symbol
## colnames(1306127): AAACCTGAGATAGGAG-1 AAACCTGAGCGGCTTC-1 ...
##   TTTGTCAGTTAAAGTG-133 TTTGTCATCTGAAAGA-133
## colData names(4): Barcode Sequence Library Mouse
## reducedDimNames(0):
## spikeNames(0):
assay(tenx)
## <27998 x 1306127> DelayedMatrix object of type "integer":
##            AAACCTGAGATAGGAG-1 ... TTTGTCATCTGAAAGA-133
##     [1,]                    0   .                    0
##     [2,]                    0   .                    0
##     [3,]                    0   .                    0
##     [4,]                    0   .                    0
##     [5,]                    0   .                    0
##      ...                    .   .                    .
## [27994,]                    0   .                    0
## [27995,]                    1   .                    0
## [27996,]                    0   .                    0
## [27997,]                    0   .                    0
## [27998,]                    0   .                    0

Quickly perform basic operations

log1p(assay(tenx))
## <27998 x 1306127> DelayedMatrix object of type "double":
##            AAACCTGAGATAGGAG-1 ... TTTGTCATCTGAAAGA-133
##     [1,]                    0   .                    0
##     [2,]                    0   .                    0
##     [3,]                    0   .                    0
##     [4,]                    0   .                    0
##     [5,]                    0   .                    0
##      ...                    .   .                    .
## [27994,]            0.0000000   .                    0
## [27995,]            0.6931472   .                    0
## [27996,]            0.0000000   .                    0
## [27997,]            0.0000000   .                    0
## [27998,]            0.0000000   .                    0

Subset and summarize ‘library size’ for 1000 cells

tenx_subset <- tenx[,1:1000]
lib_size <- colSums(assay(tenx_subset))
hist(log10(lib_size))

The data is sparse, with more than 92% of cells equal to 0

sum(assay(tenx_subset) == 0) / prod(dim(tenx_subset))
## [1] 0.9276541

2 Write or use efficient R code

Avoid unnecessary copying

n <- 50000

## makes a copy of `res1` on each iteration
set.seed(123)
system.time({
    res1 <- NULL
    for (i in 1:n)
        res1 <- c(res1, rnorm(1))
})
##    user  system elapsed 
##   7.098   2.908  10.013
## pre-allocation
set.seed(123)
system.time({
    res2 <- numeric(n)
    for (i in 1:n)
        res2[i] <- rnorm(1)
})
##    user  system elapsed 
##   0.078   0.004   0.082
identical(res1, res2)
## [1] TRUE
## no need to think about allocation!
set.seed(123)
system.time({
    res3 <- sapply(1:n, function(i) rnorm(1))
})
##    user  system elapsed 
##   0.094   0.002   0.096
identical(res1, res3)
## [1] TRUE

Vectorize your own scripts

n <- 2000000

## iteration: n calls to `rnorm(1)`
set.seed(123)
system.time({
    res1 <- sapply(1:n, function(i) rnorm(1))
})
##    user  system elapsed 
##   5.678   0.693   6.374
## vectorize: 1 call to `rnorm(n)`
set.seed(123)
system.time( res2 <- rnorm(n) )
##    user  system elapsed 
##   0.137   0.000   0.136
identical(res1, res2)
## [1] TRUE

Reuse other people’s efficient code.

Examples in the lab this afternoon, and from Tuesday evening

3 Use chunk-wise iteration

Example: nucleotide frequency of mapped reads

3.1 An example without chunking

Working through example data; not really large…

library(RNAseqData.HNRNPC.bam.chr14)
fname <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1]
basename(fname)
## [1] "ERR127306_chr14.bam"
Rsamtools::countBam(fname)
##   space start end width                file records nucleotides
## 1    NA    NA  NA    NA ERR127306_chr14.bam  800484    57634848

Input into a GAlignments object, including seq (sequence) of each read.

library(GenomicAlignments)
param <- ScanBamParam(what = "seq")
galn <- readGAlignments(fname, param = param)

Write a function to determine GC content of reads

library(Biostrings)
gc_content <- function(galn) {
    seq <- mcols(galn)[["seq"]]
    gc <- letterFrequency(seq, "GC", as.prob=TRUE)
    as.vector(gc)
}

Calculate and display GC content

param <- ScanBamParam(what = "seq")
galn <- readGAlignments(fname, param = param)
res1 <- gc_content(galn)
hist(res1)

3.2 The same example with chunking

Open file for reading, specifying ‘yield’ size

bfl <- BamFile(fname, yieldSize = 100000)
open(bfl)

Repeatedly read chunks of data and calculate GC content

res2 <- numeric()
repeat {
    message(".")
    galn <- readGAlignments(bfl, param = param)
    if (length(galn) == 0)
        break
    ## inefficient copy of res2, but only a few iterations...
    res2 <- c(res2, gc_content(galn))
}
## .
## .
## .
## .
## .
## .
## .
## .
## .
## .

Clean up and compare approaches

close(bfl)
identical(res1, res2)
## [1] TRUE

4 Use (classical) parallel evaluation

Many down-sides

Maximum speed-up

4.1 BiocParallel

fun <- function(i) {
    Sys.sleep(1)    # a time-consuming calculation
    i               # and then the result
}

system.time({
    res1 <- lapply(1:10, fun)
})
##    user  system elapsed 
##   0.003   0.000  10.036
library(BiocParallel)
system.time({
    res2 <- bplapply(1:10, fun)
})
##    user  system elapsed 
##   2.034   0.062   3.170
identical(res1, res2)
## [1] TRUE
  • ‘Forked’ processes (non-Windows)
    • No need to distribute data from main thread to workers
  • Independent processes
  • Classic clusters, e.g., slurm
  • Coming: cloud-based solutions

4.2 GenomicFiles

Parallel, chunk-wise iteration through genomic files. Set up:

library(GenomicFiles)

Define a yield function that provides a chunk of data for processing

yield <- function(x) {
    param <- ScanBamParam(what = "seq")
    readGAlignments(x, param = param)
}

Define a map function that transforms the input data to the desired result

map <- function(x) {
    seq <- mcols(x)[["seq"]]
    gc <- letterFrequency(seq, "GC", as.prob = TRUE)
    as.vector(gc)
}

Define a reduce function that combines two successive results

reduce <- c

Perform the calculation, chunk-wise and in parallel

library(RNAseqData.HNRNPC.bam.chr14)
fname <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1]
bfl <- BamFile(fname, yieldSize = 100000)

res <- reduceByYield(bfl, yield, map, reduce, parallel = TRUE)
hist(res)

5 Query specific values

6 Provenance

sessionInfo()
## R version 3.6.1 Patched (2019-07-16 r76845)
## Platform: x86_64-apple-darwin17.7.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Users/ma38727/bin/R-3-6-branch/lib/libRblas.dylib
## LAPACK: /Users/ma38727/bin/R-3-6-branch/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] GenomicFiles_1.21.0                rtracklayer_1.45.2                
##  [3] GenomicAlignments_1.21.4           Rsamtools_2.1.3                   
##  [5] Biostrings_2.53.2                  XVector_0.25.0                    
##  [7] RNAseqData.HNRNPC.bam.chr14_0.23.0 TENxBrainData_1.5.0               
##  [9] HDF5Array_1.13.4                   rhdf5_2.29.0                      
## [11] SingleCellExperiment_1.7.0         SummarizedExperiment_1.15.5       
## [13] DelayedArray_0.11.4                BiocParallel_1.19.0               
## [15] matrixStats_0.54.0                 Biobase_2.45.0                    
## [17] GenomicRanges_1.37.14              GenomeInfoDb_1.21.1               
## [19] IRanges_2.19.10                    S4Vectors_0.23.17                 
## [21] BiocGenerics_0.31.5                BiocStyle_2.13.2                  
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.0                    bit64_0.9-7                  
##  [3] AnnotationHub_2.17.6          shiny_1.3.2                  
##  [5] assertthat_0.2.1              askpass_1.1                  
##  [7] interactiveDisplayBase_1.23.0 BiocManager_1.30.4           
##  [9] BiocFileCache_1.9.1           blob_1.2.0                   
## [11] BSgenome_1.53.0               GenomeInfoDbData_1.2.1       
## [13] progress_1.2.2                yaml_2.2.0                   
## [15] pillar_1.4.2                  RSQLite_2.1.2                
## [17] backports_1.1.4               lattice_0.20-38              
## [19] glue_1.3.1                    digest_0.6.20                
## [21] promises_1.0.1                htmltools_0.3.6              
## [23] httpuv_1.5.1                  Matrix_1.2-17                
## [25] XML_3.98-1.20                 pkgconfig_2.0.2              
## [27] biomaRt_2.41.7                bookdown_0.12                
## [29] zlibbioc_1.31.0               purrr_0.3.2                  
## [31] xtable_1.8-4                  later_0.8.0                  
## [33] openssl_1.4.1                 tibble_2.1.3                 
## [35] GenomicFeatures_1.37.4        magrittr_1.5                 
## [37] crayon_1.3.4                  mime_0.7                     
## [39] memoise_1.1.0                 evaluate_0.14                
## [41] prettyunits_1.0.2             tools_3.6.1                  
## [43] hms_0.5.0                     stringr_1.4.0                
## [45] Rhdf5lib_1.7.3                AnnotationDbi_1.47.0         
## [47] compiler_3.6.1                rlang_0.4.0                  
## [49] grid_3.6.1                    RCurl_1.95-4.12              
## [51] VariantAnnotation_1.31.3      rappdirs_0.3.1               
## [53] bitops_1.0-6                  rmarkdown_1.14               
## [55] ExperimentHub_1.11.3          codetools_0.2-16             
## [57] DBI_1.0.0                     curl_4.0                     
## [59] R6_2.4.0                      knitr_1.23                   
## [61] dplyr_0.8.3                   bit_1.1-14                   
## [63] zeallot_0.1.0                 stringi_1.4.3                
## [65] Rcpp_1.0.1                    vctrs_0.2.0                  
## [67] dbplyr_1.4.2                  tidyselect_0.2.5             
## [69] xfun_0.8