COTAN 2.7.1
library(COTAN)
library(zeallot)
library(rlang)
library(data.table)
library(Rtsne)
library(qpdf)
library(GEOquery)
library(ComplexHeatmap)
options(parallelly.fork.enable = TRUE)
This tutorial contains the same functionalities as the first release of the COTAN tutorial but done using the new and updated functions.
Download the data-set for "mouse cortex E17.5"
.
dataDir <- tempdir()
GEO <- "GSM2861514"
fName <- "GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz"
dataSetFile <- file.path(dataDir, GEO, fName)
dir.create(file.path(dataDir, GEO), showWarnings = FALSE)
if (!file.exists(dataSetFile)) {
getGEOSuppFiles(GEO, makeDirectory = TRUE,
baseDir = dataDir, fetch_files = TRUE,
filter_regex = fName)
}
#> size
#> /tmp/RtmpWBR8Yp/GSM2861514/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz 1509523
#> isdir
#> /tmp/RtmpWBR8Yp/GSM2861514/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz FALSE
#> mode
#> /tmp/RtmpWBR8Yp/GSM2861514/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz 644
#> mtime
#> /tmp/RtmpWBR8Yp/GSM2861514/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz 2024-11-13 16:40:55
#> ctime
#> /tmp/RtmpWBR8Yp/GSM2861514/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz 2024-11-13 16:40:55
#> atime
#> /tmp/RtmpWBR8Yp/GSM2861514/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz 2024-11-13 16:40:54
#> uid
#> /tmp/RtmpWBR8Yp/GSM2861514/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz 1001
#> gid
#> /tmp/RtmpWBR8Yp/GSM2861514/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz 1001
#> uname
#> /tmp/RtmpWBR8Yp/GSM2861514/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz biocbuild
#> grname
#> /tmp/RtmpWBR8Yp/GSM2861514/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz biocbuild
sample.dataset <- read.csv(dataSetFile, sep = "\t", row.names = 1L)
Define a directory where the output will be stored.
outDir <- dataDir
# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)
#> Setting new log level to 2
# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_v2.log"))
#> Setting log file to be: /tmp/RtmpWBR8Yp/vignette_v2.log
message("COTAN uses the `torch` library when asked to `optimizeForSpeed`")
#> COTAN uses the `torch` library when asked to `optimizeForSpeed`
message("Run the command 'options(COTAN.UseTorch = FALSE)'",
" in your session to disable `torch` completely!")
#> Run the command 'options(COTAN.UseTorch = FALSE)' in your session to disable `torch` completely!
# this command does check whether the torch library is properly installed
c(useTorch, deviceStr) %<-% COTAN:::canUseTorch(TRUE, "cuda")
#> While trying to load the `torch` library Error in doTryCatch(return(expr), name, parentenv, handler): The `torch` library is installed but the required additional libraries are not avalable yet
#> Warning in value[[3L]](cond): The `torch` library is installed, but might
#> require further initialization
#> Warning in value[[3L]](cond): Please look at the `torch` package installation
#> guide to complete the installation
#> Warning in COTAN:::canUseTorch(TRUE, "cuda"): Falling back to legacy
#> [non-torch] code.
if (useTorch) {
message("The `torch` library is available and ready to use")
if (deviceStr == "cuda") {
message("The `torch` library can use the `CUDA` GPU")
} else {
message("The `torch` library can only use the CPU")
message("Please ensure you have the `OpenBLAS` libraries",
" installed on the system")
}
}
rm(useTorch, deviceStr)
Initialize the COTAN
object with the row count table and
the metadata for the experiment.
cond <- "mouse_cortex_E17.5"
obj <- COTAN(raw = sample.dataset)
obj <- initializeMetaDataset(obj,
GEO = GEO,
sequencingMethod = "Drop_seq",
sampleCondition = cond)
#> Initializing `COTAN` meta-data
logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
logLevel = 1L)
#> Condition mouse_cortex_E17.5
Before we proceed to the analysis, we need to clean the data. The analysis will use a matrix of raw UMI counts as the input. To obtain this matrix, we have to remove any potential cell doublets or multiplets, as well as any low quality or dying cells.
We can check the library size (UMI number) with an empirical cumulative distribution function
plot(ECDPlot(obj))
plot(cellSizePlot(obj))
plot(genesSizePlot(obj))
#> Warning: Removed 1 row containing missing values or values outside the scale range
#> (`geom_point()`).
plot(scatterPlot(obj))
During the cleaning, every time we want to remove cells or genes
we can use the dropGenesCells()
function.
Drop cells with too many reads as they are probably doublets or multiplets
cellsSizeThr <- 6000L
obj <- addElementToMetaDataset(obj, "Cells size threshold", cellsSizeThr)
cells_to_rem <- getCells(obj)[getCellsSize(obj) > cellsSizeThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
plot(cellSizePlot(obj))
To drop cells by gene number: high genes count might also indicate doublets…
genesSizeHighThr <- 3000L
obj <- addElementToMetaDataset(obj, "Num genes high threshold", genesSizeHighThr)
cells_to_rem <- getCells(obj)[getNumExpressedGenes(obj) > genesSizeHighThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
plot(genesSizePlot(obj))
#> Warning: Removed 1 row containing missing values or values outside the scale range
#> (`geom_point()`).
Drop cells with too low genes expression as they are probably dead
genesSizeLowThr <- 500L
obj <- addElementToMetaDataset(obj, "Num genes low threshold", genesSizeLowThr)
cells_to_rem <- getCells(obj)[getNumExpressedGenes(obj) < genesSizeLowThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
plot(genesSizePlot(obj))
Cells with a too high percentage of mitochondrial genes are likely dead (or at the last problematic) cells. So we drop them!
c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt")
plot(mitPlot)
mitPercThr <- 1.0
obj <- addElementToMetaDataset(obj, "Mitoc. perc. threshold", mitPercThr)
cells_to_rem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt")
plot(mitPlot)
If we do not want to consider the mitochondrial genes we can remove them before starting the analysis.
genes_to_rem <- getGenes(obj)[grep("^Mt", getGenes(obj))]
cells_to_rem <- getCells(obj)[which(getCellsSize(obj) == 0L)]
obj <- dropGenesCells(obj, genes_to_rem, cells_to_rem)
We want also to log the current status.
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
#> n cells 859
The clean()
function estimates all the parameters for the data. Therefore, we have to run it again every time we remove any genes or cells from the data.
obj <- addElementToMetaDataset(obj, "Num drop B group", 0)
obj <- clean(obj)
#> Genes/cells selection done: dropped [4787] genes and [0] cells
#> Working on [12235] genes and [859] cells
c(pcaCellsPlot, pcaCellsData, genesPlot,
UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)
#> PCA: START
#> PCA: DONE
#> Hierarchical clustering: START
#> Hierarchical clustering: DONE
plot(pcaCellsPlot)
plot(genesPlot)
We can observe here that the red cells are really enriched in hemoglobin genes so we prefer to remove them. They can be extracted from the pcaCellsData
object and removed.
cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]
obj <- dropGenesCells(obj, cells = cells_to_rem)
obj <- addElementToMetaDataset(obj, "Num drop B group", 1)
obj <- clean(obj)
#> Genes/cells selection done: dropped [5] genes and [0] cells
#> Working on [12230] genes and [855] cells
c(pcaCellsPlot, pcaCellsData, genesPlot,
UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)
#> PCA: START
#> PCA: DONE
#> Hierarchical clustering: START
#> Hierarchical clustering: DONE
plot(pcaCellsPlot)
To color the PCA based on nu
(so the cells’ efficiency)
plot(UDEPlot)
UDE (color) should not correlate with principal components! This is very important.
The next part is used to remove the cells with efficiency too low.
plot(nuPlot)
We can zoom on the smallest values and, if COTAN detects a clear elbow, we can decide to remove the cells.
plot(zoomedNuPlot)
We also save the defined threshold in the metadata and re-run the estimation
UDELowThr <- 0.30
obj <- addElementToMetaDataset(obj, "Low UDE cells' threshold", UDELowThr)
obj <- addElementToMetaDataset(obj, "Num drop B group", 2)
cells_to_rem <- getCells(obj)[getNu(obj) < UDELowThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
Repeat the estimation after the cells are removed
obj <- clean(obj)
#> Genes/cells selection done: dropped [3] genes and [0] cells
#> Working on [12227] genes and [852] cells
c(pcaCellsPlot, pcaCellsData, genesPlot,
UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)
#> PCA: START
#> PCA: DONE
#> Hierarchical clustering: START
#> Hierarchical clustering: DONE
plot(pcaCellsPlot)
plot(scatterPlot(obj))
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
#> n cells 852
In this part, all the contingency tables are computed
and used to get the statistics necessary to COEX
evaluation and storing
obj <- proceedToCoex(obj, calcCoex = TRUE,
optimizeForSpeed = TRUE, cores = 6L, deviceStr = "cuda",
saveObj = FALSE, outDir = outDir)
#> Cotan analysis functions started
#> Genes/cells selection done: dropped [0] genes and [0] cells
#> Working on [12227] genes and [852] cells
#> Estimate dispersion: START
#> Estimate dispersion: DONE
#> dispersion | min: -0.056854248046875 | max: 372.75 | % negative: 19.5469043919195
#> Cotan genes' coex estimation started
#> While trying to load the `torch` library Error in doTryCatch(return(expr), name, parentenv, handler): The `torch` library is installed but the required additional libraries are not avalable yet
#> Warning in canUseTorch(optimizeForSpeed, deviceStr): Falling back to legacy
#> [non-torch] code.
#> Calculate genes' coex (legacy): START
#> Warning in asMethod(object): sparse->dense coercion: allocating vector of size
#> 1.1 GiB
#> Total calculations elapsed time: 166.998006343842
#> Calculate genes' coex (legacy): DONE
When saveObj == TRUE
, in the previous step, this step can be skipped
as the COTAN
object has already been saved in the outDir
.
# saving the structure
saveRDS(obj, file = file.path(outDir, paste0(cond, ".cotan.RDS")))
It is also possible to run directly a single function if we don’t want to clean anything.
obj2 <- automaticCOTANObjectCreation(
raw = sample.dataset,
GEO = GEO,
sequencingMethod = "Drop_seq",
sampleCondition = cond,
calcCoex = TRUE, cores = 6L, optimizeForSpeed = TRUE,
saveObj = TRUE, outDir = outDir)
To calculate and store the Global Differentiation Index (GDI
) we can run:
gdiDF <- calculateGDI(obj)
#> Calculate GDI dataframe: START
#> Calculate GDI dataframe: DONE
head(gdiDF)
#> sum.raw.norm GDI exp.cells
#> 0610007N19Rik 3.332671 1.626640 2.230047
#> 0610007P14Rik 5.279001 1.349530 18.779343
#> 0610009B22Rik 4.733316 1.281530 11.619718
#> 0610009D07Rik 6.313149 1.364269 43.427230
#> 0610009E02Rik 2.590242 1.016320 1.173709
#> 0610009L18Rik 3.509489 1.256757 3.051643
# This will store only the $GDI column
obj <- storeGDI(obj, genesGDI = gdiDF)
The next function can either plot the GDI
for the dataset directly or
use the pre-computed dataframe.
It marks the given threshold 1.43 (in red) and
the 50% and 75% quantiles (in blue).
We can also specify some gene sets (three in this case) that
we want to label explicitly in the GDI
plot.
genesList <- list(
"NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"),
"PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"),
"hk" = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a",
"Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1",
"Tars", "Amacr")
)
GDIPlot(obj, cond = cond, genes = genesList, GDIThreshold = 1.40)
#> GDI plot
#> Removed 0 low GDI genes (such as the fully-expressed) in GDI plot
The percentage of cells expressing the gene in the third column of this data-frame is reported.
To perform the Gene Pair Analysis, we can plot a heatmap of the COEX
values
between two gene sets.
We have to define the different gene sets (list.genes
) in a list.
Then we can choose which sets to use in the function parameter sets
(for example, from 1 to 3).
We also have to provide an array of the file name prefixes for each condition
(for example, “mouse_cortex_E17.5”).
In fact, this function can plot genes relationships across many different
conditions to get a complete overview.
plot(heatmapPlot(obj, genesLists = genesList))
#> heatmap plot: START
#> Hangling COTAN object with condition: mouse_cortex_E17.5
#> calculating PValues: START
#> Get p-values on a set of genes on columns and on a set of genes on rows
#> calculating PValues: DONE
#> min coex: -0.475089564305104 max coex: 0.456662313361164
We can also plot a general heatmap of COEX
values based on some markers like
the following one.
plot(genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Vim", "Hes1"),
pValueThreshold = 0.001, symmetric = TRUE))
#> calculating PValues: START
#> Get p-values genome wide on columns and genome wide on rows
#> calculating PValues: DONE
plot(genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Fezf2"),
secondaryMarkers = c("Gabra3", "Meg3", "Cux1", "Neurod6"),
pValueThreshold = 0.001, symmetric = FALSE))
Sometimes we can also be interested in the numbers present directly in the contingency tables for two specific genes. To get them we can use two functions:
contingencyTables()
to produce the observed and expected data
print("Contingency Tables:")
#> [1] "Contingency Tables:"
contingencyTables(obj, g1 = "Satb2", g2 = "Bcl11b")
#> $observed
#> Satb2.yes Satb2.no
#> Bcl11b.yes 47 149
#> Bcl11b.no 287 369
#>
#> $expected
#> Satb2.yes Satb2.no
#> Bcl11b.yes 82.78537 113.2154
#> Bcl11b.no 251.21413 404.7851
print("Corresponding Coex")
#> [1] "Corresponding Coex"
getGenesCoex(obj)["Satb2", "Bcl11b"]
#> [1] -0.2028008
Another useful function is getGenesCoex()
. This can be used to extract
the whole or a partial COEX
matrix from a COTAN
object.
# For the whole matrix
coex <- getGenesCoex(obj, zeroDiagonal = FALSE)
coex[1000L:1005L, 1000L:1005L]
#> 6 x 6 Matrix of class "dspMatrix"
#> Ap3s1 Ap3s2 Ap4b1 Ap4e1 Ap4m1 Ap4s1
#> Ap3s1 0.91872636 -0.02997150 0.028065790 -0.02865879 0.01372465 -0.032402737
#> Ap3s2 -0.02997150 0.91460404 -0.045830195 -0.03058359 -0.05045147 0.095316128
#> Ap4b1 0.02806579 -0.04583020 0.907981121 -0.03650627 0.02944754 0.009052527
#> Ap4e1 -0.02865879 -0.03058359 -0.036506269 0.82865653 -0.04320115 -0.025651002
#> Ap4m1 0.01372465 -0.05045147 0.029447542 -0.04320115 0.91486865 0.047203683
#> Ap4s1 -0.03240274 0.09531613 0.009052527 -0.02565100 0.04720368 0.908152183
# For a partial matrix
coex <- getGenesCoex(obj, genes = c("Satb2", "Bcl11b", "Fezf2"))
coex[1000L:1005L, ]
#> 6 x 3 Matrix of class "dgeMatrix"
#> Bcl11b Fezf2 Satb2
#> Ap3s1 -0.04615274 -0.003499993 0.01372725
#> Ap3s2 0.01281042 0.026728977 0.01784518
#> Ap4b1 0.04062971 0.010720802 -0.03173374
#> Ap4e1 -0.05290965 -0.015074415 -0.03450185
#> Ap4m1 0.05811928 0.020840815 -0.03963970
#> Ap4s1 -0.06858325 0.006087159 -0.01587705
COTAN
provides a way to establish genes’ clusters given some lists of markers
layersGenes <- list(
"L1" = c("Reln", "Lhx5"),
"L2/3" = c("Satb2", "Cux1"),
"L4" = c("Rorb", "Sox5"),
"L5/6" = c("Bcl11b", "Fezf2"),
"Prog" = c("Vim", "Hes1")
)
c(gSpace, eigPlot, pcaGenesClDF, treePlot) %<-%
establishGenesClusters(obj, groupMarkers = layersGenes,
numGenesPerMarker = 25L, kCuts = 5L)
#> Establishing gene clusters - START
#> Calculating gene co-expression space - START
#> calculating PValues: START
#> Get p-values on a set of genes on columns and genome wide on rows
#> calculating PValues: DONE
#> Number of selected secondary markers: 184
#> Calculating gene co-expression space - DONE
#> Establishing gene clusters - DONE
plot(eigPlot)
plot(treePlot)
colSelection <- vapply(pcaGenesClDF, is.numeric, logical(1L))
genesUmapPl <- UMAPPlot(pcaGenesClDF[, colSelection, drop = FALSE],
clusters = getColumnFromDF(pcaGenesClDF, "hclust"),
elements = layersGenes,
title = "Genes' clusters UMAP Plot",
numNeighbors = 32L, minPointsDist = 0.25)
#> UMAP plot
#> [2024-11-13 16:46:30.423002] starting umap
#> [2024-11-13 16:46:30.467835] creating graph of nearest neighbors
#> Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
#> Also defined by 'spam'
#> [2024-11-13 16:46:32.494569] creating initial embedding
#> [2024-11-13 16:46:32.677751] optimizing embedding
#> [2024-11-13 16:46:37.231683] done
plot(genesUmapPl)
It is possible to obtain a cell clusterization based on the concept of
uniformity of expression of the genes across the cells.
That is the cluster satisfies the null hypothesis of the COTAN
model:
the genes expression is not dependent on the cell in consideration.
There are two functions involved into obtaining a proper clusterization:
the first is cellsUniformClustering
that uses standard tools clusterization
methods, but then discards and re-clusters any non-uniform cluster.
Please note that the most important parameters for the users are the
GDIThreshold
s inside the Uniform Transcript checkers: they define how
strict is the check. Default constructed advance check gives a pretty strong
guarantee of uniformity for the cluster.
# This code is a little too computationally heavy to be used in an example
# So we stored the result and we can load it in the next section
# default constructed checker is OK
advChecker <- new("AdvancedGDIUniformityCheck")
c(splitClusters, splitCoexDF) %<-%
cellsUniformClustering(obj, initialResolution = 0.8, checker = advChecker,
optimizeForSpeed = TRUE, deviceStr = "cuda",
cores = 6L, saveObj = TRUE, outDir = outDir)
obj <- addClusterization(obj, clName = "split",
clusters = splitClusters, coexDF = splitCoexDF)
table(splitClusters)
In the case one already has an existing clusterization, it is possible to
calculate the DEA data.frame
and add it to the COTAN
object.
data("vignette.split.clusters", package = "COTAN")
splitClusters <- vignette.split.clusters[getCells(obj)]
splitCoexDF <- DEAOnClusters(obj, clusters = splitClusters)
#> Differential Expression Analysis - START
#> **************
#> Differential Expression Analysis - DONE
obj <- addClusterization(obj, clName = "split", clusters = splitClusters,
coexDF = splitCoexDF, override = FALSE)
It is possible to have some statistics about the clusterization
c(summaryData, summaryPlot) %<-%
clustersSummaryPlot(obj, clName = "split", plotTitle = "split summary")
summaryData
#> split NoCond CellNumber CellPercentage MeanUDE MedianUDE ExpGenes25 ExpGenes
#> 1 -1 NoCond 71 8.3 0.92 0.86 1449 10173
#> 2 01 NoCond 237 27.8 0.80 0.76 1201 11274
#> 3 02 NoCond 168 19.7 1.46 1.39 2223 11578
#> 4 03 NoCond 28 3.3 1.40 1.38 2104 8851
#> 5 04 NoCond 36 4.2 1.13 1.09 1709 9113
#> 6 05 NoCond 32 3.8 1.66 1.61 2412 9597
#> 7 06 NoCond 20 2.3 0.99 0.90 1452 7996
#> 8 07 NoCond 41 4.8 1.09 0.97 1614 9822
#> 9 08 NoCond 40 4.7 0.61 0.58 840 8012
#> 10 09 NoCond 25 2.9 0.73 0.67 1105 7409
#> 11 10 NoCond 22 2.6 1.38 1.29 2338 8930
#> 12 11 NoCond 49 5.8 0.59 0.55 832 8606
#> 13 12 NoCond 38 4.5 0.44 0.43 490 7845
#> 14 13 NoCond 45 5.3 0.76 0.74 1121 9122
The ExpGenes
column contains the number of genes that are expressed in any
cell of the relevant cluster, while the ExpGenes25
column contains the number of genes expressed in at the least 25% of the cells of the relevant cluster
plot(summaryPlot)
It is possible to visualize how relevant are the marker genes’ lists
with
respect to the given clusterization
c(splitHeatmap, scoreDF, pValueDF) %<-%
clustersMarkersHeatmapPlot(obj, groupMarkers = layersGenes,
clName = "split", kCuts = 5L,
adjustmentMethod = "holm")
draw(splitHeatmap)
Since the algorithm that creates the clusters is not directly geared to achieve cluster uniformity, there might be some clusters that can be merged back together and still be uniform.
This is the purpose of the function mergeUniformCellsClusters
that takes a
clusterization and tries to merge cluster pairs after checking that together
the pair forms a uniform cluster.
In order to avoid running the totality of the possible checks (as it can explode quickly with the number of clusters), the function relies on a related distance the find the cluster pairs that have the highest chance to be merged.
c(mergedClusters, mergedCoexDF) %<-%
mergeUniformCellsClusters(obj, clusters = splitClusters,
checkers = advChecker,
optimizeForSpeed = TRUE, deviceStr = "cuda",
cores = 6L, saveObj = TRUE, outDir = outDir)
# merges are:
# 1 <- 06 + 07
# 2 <- '-1' + 08 + 11
# 3 <- 09 + 10
# 4 <- 01
# 5 <- 02
# 6 <- 12 + 13
# 7 <- 03 + 05
# 8 <- 04
obj <- addClusterization(obj, clName = "merge", override = TRUE,
clusters = mergedClusters, coexDF = mergedCoexDF)
table(mergedClusters)
Again, here, the most important parameters for the users are the GDIThreshold
s
inside the Uniform Transcript checkers: they define how strict is the
check. Default constructed advance check gives a pretty strong guarantee of
uniformity for the cluster. If one wants to achieve a more relaxed merge,
it is possible to define a sequence of checkers, each less stringent than the
previous, that will be applied sequentially: First all the merges with the
stricter checker are performed, than those that satisfy the second, and so on.
checkersList <- list(advChecker,
shiftCheckerThresholds(advChecker, 0.01),
shiftCheckerThresholds(advChecker, 0.03))
prevCheckRes <- data.frame()
# In this case we want to re-use the already calculated merge checks
# Se we reload them from the output files. This, along a similar facility for
# the split method, is also useful in the cases the execution is interrupted
# prematurely...
#
if (TRUE) {
# read from the last file among those named all_check_results_XX.csv
mergeDir <- file.path(outDir, cond, "leafs_merge")
resFiles <- list.files(path = mergeDir, pattern = "all_check_results_.*csv",
full.names = TRUE)
prevCheckRes <- read.csv(resFiles[length(resFiles)],
header = TRUE, row.names = 1L)
}
c(mergedClusters2, mergedCoexDF2) %<-%
mergeUniformCellsClusters(obj, clusters = splitClusters,
checkers = checkersList,
allCheckResults = prevCheckRes,
optimizeForSpeed = TRUE, deviceStr = "cuda",
cores = 6L, saveObj = TRUE, outDir = outDir)
# merges are:
# 1 <- '-1' + 06 + 09
# 2 <- 07
# 3 <- 03 + 04 + 05 + 10 + 13
# 4 <- 12
# 5 <- 01 + 08 + 11
# 6 <- 02
obj <- addClusterization(obj, clName = "merge2", override = TRUE,
clusters = mergedClusters2, coexDF = mergedCoexDF2)
table(mergedClusters2)
In the case one already has existing clusterizations, it is possible to
calculate their DEA data.frame
and add them to the COTAN
object.
data("vignette.merge.clusters", package = "COTAN")
mergedClusters <- vignette.merge.clusters[getCells(obj)]
mergedCoexDF <- DEAOnClusters(obj, clusters = mergedClusters)
#> Differential Expression Analysis - START
#> ********
#> Differential Expression Analysis - DONE
obj <- addClusterization(obj, clName = "merge", clusters = mergedClusters,
coexDF = mergedCoexDF, override = FALSE)
data("vignette.merge2.clusters", package = "COTAN")
mergedClusters2 <- vignette.merge2.clusters[getCells(obj)]
mergedCoexDF2 <- DEAOnClusters(obj, clusters = mergedClusters2)
#> Differential Expression Analysis - START
#> ******
#> Differential Expression Analysis - DONE
obj <- addClusterization(obj, clName = "merge2", clusters = mergedClusters2,
coexDF = mergedCoexDF2, override = FALSE)
Here is possible to visualize how the split
clusters are merged in to
merge
and merge2
c(mergeHeatmap, ...) %<-%
clustersMarkersHeatmapPlot(obj, clName = "merge", condNameList = "split",
conditionsList = list(splitClusters))
draw(mergeHeatmap)
c(mergeHeatmap2, ...) %<-%
clustersMarkersHeatmapPlot(obj, clName = "merge2", condNameList = "split",
conditionsList = list(splitClusters))
draw(mergeHeatmap2)
In the following graph, it is possible to check that the found clusters align very well to the expression of the layers’ genes defined above…
It is possible to plot the clusterization on a UMAP
plot
c(umapPlot, cellsPCA) %<-% cellsUMAPPlot(obj, clName = "merge2",
dataMethod = "AdjBinarized",
genesSel = "HGDI",
colors = NULL, numNeighbors = 15L,
minPointsDist = 0.2)
#> Selected 1805 genes using HGDI selector
#> UMAP plot
#> [2024-11-13 16:46:50.387073] starting umap
#> [2024-11-13 16:46:50.432142] creating graph of nearest neighbors
#> Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
#> Also defined by 'spam'
#> [2024-11-13 16:46:51.281568] creating initial embedding
#> [2024-11-13 16:46:51.358446] optimizing embedding
#> [2024-11-13 16:46:54.179354] done
plot(umapPlot)
#> Warning: No shared levels found between `names(values)` of the manual scale and the
#> data's fill values.
The next few lines are just to clean.
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
#Delete file if it exists
file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}
if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) {
#Delete file if it exists
file.remove(file.path(outDir, paste0(cond, "_times.csv")))
}
if (dir.exists(file.path(outDir, cond))) {
unlink(file.path(outDir, cond), recursive = TRUE)
}
if (dir.exists(file.path(outDir, GEO))) {
unlink(file.path(outDir, GEO), recursive = TRUE)
}
# stop logging to file
setLoggingFile("")
#> Closing previous log file - Setting log file to be:
file.remove(file.path(outDir, "vignette_v2.log"))
#> [1] TRUE
options(parallelly.fork.enable = FALSE)
Sys.time()
#> [1] "2024-11-13 16:46:54 EST"
sessionInfo()
#> R Under development (unstable) (2024-10-21 r87258)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.1 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.21-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] grid stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] ComplexHeatmap_2.23.0 GEOquery_2.75.0 Biobase_2.67.0
#> [4] BiocGenerics_0.53.2 generics_0.1.3 qpdf_1.3.4
#> [7] Rtsne_0.17 data.table_1.16.2 rlang_1.1.4
#> [10] zeallot_0.1.0 COTAN_2.7.1 BiocStyle_2.35.0
#>
#> loaded via a namespace (and not attached):
#> [1] matrixStats_1.4.1 spatstat.sparse_3.1-0
#> [3] httr_1.4.7 RColorBrewer_1.1-3
#> [5] doParallel_1.0.17 tools_4.5.0
#> [7] sctransform_0.4.1 utf8_1.2.4
#> [9] R6_2.5.1 lazyeval_0.2.2
#> [11] uwot_0.2.2 GetoptLong_1.0.5
#> [13] withr_3.0.2 sp_2.1-4
#> [15] gridExtra_2.3 parallelDist_0.2.6
#> [17] progressr_0.15.0 cli_3.6.3
#> [19] Cairo_1.6-2 spatstat.explore_3.3-3
#> [21] fastDummies_1.7.4 labeling_0.4.3
#> [23] sass_0.4.9 Seurat_5.1.0
#> [25] spatstat.data_3.1-2 readr_2.1.5
#> [27] ggridges_0.5.6 pbapply_1.7-2
#> [29] askpass_1.2.1 rentrez_1.2.3
#> [31] parallelly_1.39.0 limma_3.63.2
#> [33] torch_0.13.0 shape_1.4.6.1
#> [35] ica_1.0-3 spatstat.random_3.3-2
#> [37] dplyr_1.1.4 dendextend_1.18.1
#> [39] Matrix_1.7-1 fansi_1.0.6
#> [41] S4Vectors_0.45.1 abind_1.4-8
#> [43] PCAtools_2.19.0 lifecycle_1.0.4
#> [45] yaml_2.3.10 SummarizedExperiment_1.37.0
#> [47] SparseArray_1.7.1 promises_1.3.0
#> [49] dqrng_0.4.1 crayon_1.5.3
#> [51] miniUI_0.1.1.1 lattice_0.22-6
#> [53] beachmat_2.23.1 cowplot_1.1.3
#> [55] magick_2.8.5 pillar_1.9.0
#> [57] knitr_1.49 GenomicRanges_1.59.0
#> [59] rjson_0.2.23 future.apply_1.11.3
#> [61] codetools_0.2-20 leiden_0.4.3.1
#> [63] glue_1.8.0 spatstat.univar_3.1-1
#> [65] vctrs_0.6.5 png_0.1-8
#> [67] spam_2.11-0 gtable_0.3.6
#> [69] assertthat_0.2.1 cachem_1.1.0
#> [71] xfun_0.49 S4Arrays_1.7.1
#> [73] mime_0.12 Rfast_2.1.0
#> [75] survival_3.7-0 SingleCellExperiment_1.29.1
#> [77] iterators_1.0.14 tinytex_0.54
#> [79] statmod_1.5.0 fitdistrplus_1.2-1
#> [81] ROCR_1.0-11 nlme_3.1-166
#> [83] bit64_4.5.2 RcppAnnoy_0.0.22
#> [85] GenomeInfoDb_1.43.0 bslib_0.8.0
#> [87] irlba_2.3.5.1 KernSmooth_2.23-24
#> [89] colorspace_2.1-1 tidyselect_1.2.1
#> [91] processx_3.8.4 bit_4.5.0
#> [93] compiler_4.5.0 curl_6.0.0
#> [95] xml2_1.3.6 DelayedArray_0.33.1
#> [97] plotly_4.10.4 bookdown_0.41
#> [99] scales_1.3.0 lmtest_0.9-40
#> [101] callr_3.7.6 stringr_1.5.1
#> [103] digest_0.6.37 goftest_1.2-3
#> [105] spatstat.utils_3.1-1 rmarkdown_2.29
#> [107] XVector_0.47.0 htmltools_0.5.8.1
#> [109] pkgconfig_2.0.3 coro_1.1.0
#> [111] umap_0.2.10.0 sparseMatrixStats_1.19.0
#> [113] MatrixGenerics_1.19.0 fastmap_1.2.0
#> [115] GlobalOptions_0.1.2 htmlwidgets_1.6.4
#> [117] ggthemes_5.1.0 UCSC.utils_1.3.0
#> [119] shiny_1.9.1 DelayedMatrixStats_1.29.0
#> [121] farver_2.1.2 jquerylib_0.1.4
#> [123] zoo_1.8-12 jsonlite_1.8.9
#> [125] BiocParallel_1.41.0 BiocSingular_1.23.0
#> [127] magrittr_2.0.3 GenomeInfoDbData_1.2.13
#> [129] dotCall64_1.2 patchwork_1.3.0
#> [131] munsell_0.5.1 Rcpp_1.0.13-1
#> [133] viridis_0.6.5 reticulate_1.39.0
#> [135] RcppZiggurat_0.1.6 stringi_1.8.4
#> [137] zlibbioc_1.53.0 MASS_7.3-61
#> [139] plyr_1.8.9 parallel_4.5.0
#> [141] listenv_0.9.1 ggrepel_0.9.6
#> [143] deldir_2.0-4 splines_4.5.0
#> [145] tensor_1.5 hms_1.1.3
#> [147] circlize_0.4.16 ps_1.8.1
#> [149] igraph_2.1.1 spatstat.geom_3.3-3
#> [151] RcppHNSW_0.6.0 reshape2_1.4.4
#> [153] stats4_4.5.0 ScaledMatrix_1.15.0
#> [155] XML_3.99-0.17 evaluate_1.0.1
#> [157] SeuratObject_5.0.2 RcppParallel_5.1.9
#> [159] BiocManager_1.30.25 tzdb_0.4.0
#> [161] foreach_1.5.2 httpuv_1.6.15
#> [163] RANN_2.6.2 tidyr_1.3.1
#> [165] openssl_2.2.2 purrr_1.0.2
#> [167] polyclip_1.10-7 future_1.34.0
#> [169] clue_0.3-66 scattermore_1.2
#> [171] ggplot2_3.5.1 rsvd_1.0.5
#> [173] xtable_1.8-4 RSpectra_0.16-2
#> [175] later_1.3.2 viridisLite_0.4.2
#> [177] tibble_3.2.1 IRanges_2.41.0
#> [179] cluster_2.1.6 globals_0.16.3