## ----load-libs, message = FALSE,  warning = FALSE-----------------------------
library(cowplot)
library(dplyr)
library(reshape2)
library(muscat)
library(purrr)
library(scater)
library(SingleCellExperiment)

## ----prepSim------------------------------------------------------------------
# estimate simulation parameters
data(example_sce)
ref <- prepSim(example_sce, verbose = FALSE)
# only samples from `ctrl` group are retained
table(ref$sample_id)
# cell parameters: library sizes
sub <- assay(example_sce[rownames(ref), colnames(ref)])
all.equal(exp(ref$offset), as.numeric(colSums(sub)))
# gene parameters: dispersions & sample-specific means
head(rowData(ref))

## ----simData------------------------------------------------------------------
# simulated 10% EE genes
sim <- simData(ref, p_dd = diag(6)[1, ],
    nk = 3, ns = 3, nc = 2e3,
    ng = 1e3, force = TRUE)
# number of cells per sample and subpopulation
table(sim$sample_id, sim$cluster_id)

## ----paired-false-------------------------------------------------------------
metadata(sim)$ref_sids

## ----paired-true--------------------------------------------------------------
# simulated paired design
sim <- simData(ref, paired = TRUE, 
    nk = 3, ns = 3, nc = 2e3,
    ng = 1e3, force = TRUE)
# same set of reference samples for both groups
ref_sids <- metadata(sim)$ref_sids
all(ref_sids[, 1] == ref_sids[, 2])

## ----p_dd---------------------------------------------------------------------
# simulate genes from all DD categories
sim <- simData(ref, p_dd = c(0.5, rep(0.1, 5)),
    nc = 2e3, ng = 1e3, force = TRUE)

## ----gi_category--------------------------------------------------------------
gi <- metadata(sim)$gene_info
table(gi$category)

## ----densities, echo = FALSE, fig.wide = TRUE, fig.width = 8, fig.height = 3, fig.cap = "Expression densities for an exemplary set of 3 genes per *differential distribution* category. Each density corresponds to one sample, lines are colored by group ID, and panels are split by gene and subpopulation."----
# simulate genes from all DD categories
sim <- simData(ref, 
    nc = 2e3, nk = 1, ns = 4,
    p_dd = c(0.5, rep(0.1, 5)),
    ng = 1e3, force = TRUE)
# normalize 
sim <- logNormCounts(sim)
# get 'n' genes per category
n <- 3
gi <- metadata(sim)$gene_info %>% 
    mutate(sim_mean = (sim_mean.A+sim_mean.B)/2) %>% 
    filter(is.na(logFC) | abs(logFC) > 2, sim_mean > 1) 
gs <- group_by(gi, category) %>% 
    group_modify(~head(.x, n = n)) %>% 
    mutate(id = paste0(gene, cluster_id))
# construct data.frame for ggplot
df <- data.frame(t(logcounts(sim)), colData(sim)) %>% 
    melt(id.vars = names(colData(sim))) %>% 
    mutate(id = paste0(variable, cluster_id)) %>% 
    mutate(id = factor(id, levels = gs$id)) %>% 
    filter(id %in% gs$id) %>% 
    mutate(cat = gs$category[match(id, gs$id)])
# use category as facet label
labs <- setNames(toupper(df$cat), df$id)
labs <- labs[unique(names(labs))]
# keep labels only for top row
labs_keep <- levels(df$id)[seq(1, length(labs), n)]
labs[setdiff(names(labs), labs_keep)] <- ""
labs <- as_labeller(labs)
# plot expression densities
ggplot(df, aes(x = value, col = group_id)) +
    facet_wrap("id", scales = "free",
        dir = "v", ncol = 6, labeller = labs) +
    geom_density() + xlab("expression") + 
    theme_minimal() + theme(
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.spacing = unit(0, "mm"),
        panel.grid = element_blank())

## ----rel-lfc, echo = FALSE, fig.wide = TRUE, fig.cap = "t-SNEs of exemplary simulations demonstrating `rel_lfc`'s effect to induce cluster-specific state changes. Cells are colored by cluster ID (top-row) and group ID (bottom-row), respectively. From left to right: No cluster-specific changes, stronger change for `cluster3`, different logFC factors for all clusters with no change for `cluster1`."----
rel_lfc <- list(
    c(1, 1, 1), # same FC factor for all clusters
    c(1, 1, 2), # stronger change for cluster3 only
    c(0, 1, 2)) # cluster-specific logFC factors; no change for cluster1
sim <- lapply(rel_lfc, function(u)
    simData(ref, rel_lfc = u,
        nc = (nc <- 1e3), nk = 3,
        p_dd = c(0.95, 0, 0.05, 0, 0, 0),
        ng = 1e3, force = TRUE))
# normalize & run dimension reduction
sim <- lapply(sim, logNormCounts)
sim <- lapply(sim, runTSNE)
# arrange plots
ps <- lapply(c("cluster_id", "group_id"), 
    function(id) lapply(sim, function(u) {
        p <- plotTSNE(u, colour_by = id)
        p$layers[[1]]$aes_params$stroke <- 0
        p + guides(fill = guide_legend(
            override.aes = list(alpha = 1, size = 3)))
    }))
ps <- Reduce("c", ps)
lgd <- lapply(ps[c(1, 4)], get_legend)
ps <- lapply(ps, "+", theme(legend.position = "none"))
plot_grid(nrow = 1, rel_widths = c(8, 1),
    plot_grid(plotlist = ps, ncol = 3, align = "hv"),
    plot_grid(plotlist = lgd, ncol = 1))

## ----p-type, echo = FALSE, fig.wide = TRUE, fig.cap = "t-SNEs of exemplary simulations demonstrating `p_type`'s effect to introduce *type* features. Cells are colored by cluster ID (top-row) and group ID (bottom-row), respectively. The percentage of type features increases from left to right (1, 5, 10%). Simulations are pure EE, i.e., all genes are non-differential across groups."----
sim <- lapply(c(0.01, 0.05, 0.1), function(u)
    simData(ref, p_type = u, 
        nc = 1e3, nk = 3, 
        ng = 1e3, force = TRUE))
# normalize & run dimension reduction
sim <- lapply(sim, logNormCounts)
sim <- lapply(sim, runTSNE)
# arrange plots
# arrange plots
ps <- lapply(c("cluster_id", "group_id"), 
    function(id) lapply(sim, function(u) {
        p <- plotTSNE(u, colour_by = id)
        p$layers[[1]]$aes_params$stroke <- 0
        p + guides(fill = guide_legend(
            override.aes = list(alpha = 1, size = 3)))
    }))
ps <- Reduce("c", ps)
lgd <- lapply(ps[c(1, 4)], get_legend)
ps <- lapply(ps, "+", theme(legend.position = "none"))
plot_grid(nrow = 1, rel_widths = c(5, 1),
    plot_grid(plotlist = ps, ncol = 3, align = "hv"),
    plot_grid(plotlist = lgd, ncol = 1))

## ----simData-type-------------------------------------------------------------
# simulate 5% of type genes; one group only
sim <- simData(ref, p_type = 0.1, 
    nc = 2e3, ng = 1e3, force = TRUE,
    probs = list(NULL, NULL, c(1, 0)))
# do log-library size normalization
sim <- logNormCounts(sim)

## ----heatmap-type, fig.width = 12, fig.height = 8, fig.cap = "Exemplary heatmap demonstrating the effect of `p_type` to introduce cluster-specific *type* genes. Included are type genes (= rows) with a simulated expression mean > 1, and a random subset of 100 cells (= columns) per cluster; column annotations represent cluster IDs. Bins are colored by expression scaled in row direction, and both genes and cells are hierarchically clustered."----
# extract gene metadata & number of clusters
rd <- rowData(sim)
nk <- nlevels(sim$cluster_id)
# filter for type genes with high expression mean
is_type <- rd$class == "type"
is_high <- rowMeans(assay(sim, "logcounts")) > 1
gs <- rownames(sim)[is_type & is_high]
# sample 100 cells per cluster for plotting
cs <- lapply(split(seq_len(ncol(sim)), sim$cluster_id), sample, 100)
plotHeatmap(sim[, unlist(cs)], features = gs, center = TRUE,
    colour_columns_by = "cluster_id", cutree_cols = nk)

## ----phylo-tree, fig.small = TRUE, fig.height = 4, fig.cap = "Exemplary phylogeny. The phylogenetic tree specified via `phylo` relates 3 clusters such that there are 2 main branches, and clusters 1 and 2 should be more similar to one another than cluster 3."----
# specify cluster phylogeny 
tree <- "(('cluster1':0.4,'cluster2':0.4):0.4,('cluster3':
    0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);"
# visualize cluster tree
library(phylogram)
plot(read.dendrogram(text = tree))

## ----simData-phylo------------------------------------------------------------
# simulate 5% of type genes; one group only
sim <- simData(ref, 
    phylo_tree = tree, phylo_pars = c(0.1, 1),
    nc = 800, ng = 1e3, dd = FALSE, force = TRUE)
# do log-library size normalization
sim <- logNormCounts(sim)

## ----heatmap-phylo, fig.width = 12, fig.height = 8, fig.cap = "Exemplary heatmap demonstrating the effect of `phylo_tree` to introduce a hierarchical cluster structure. Included are 100 randomly sampled non-state, i.e. type or shared, genes (= rows) with a simulated expression mean > 1, and a random subset of 100 cells (= columns) per cluster; column annotations represent cluster IDs. Bins are colored by expression scaled in row direction, and both genes and cells are hierarchically clustered."----
# extract gene metadata & number of clusters
rd <- rowData(sim)
nk <- nlevels(sim$cluster_id)
# filter for type & shared genes with high expression mean
is_type <- rd$class != "state"
is_high <- rowMeans(assay(sim, "logcounts")) > 1
gs <- rownames(sim)[is_type & is_high]
# sample 100 cells per cluster for plotting
cs <- lapply(split(seq_len(ncol(sim)), sim$cluster_id), sample, 50)
plotHeatmap(sim[, unlist(cs)], features = gs, 
    center = TRUE, show_rownames = FALSE,
    colour_columns_by = "cluster_id")

## ----countsimQC, eval = FALSE-------------------------------------------------
#  # load required packages
#  library(countsimQC)
#  library(DESeq2)
#  # simulate data
#  sim <- simData(ref,
#      ng = nrow(ref),
#      nc = ncol(ref),
#      dd = FALSE)
#  # construct 'DESeqDataSet's for both,
#  # simulated and reference dataset
#  dds_sim <- DESeqDataSetFromMatrix(
#      countData = counts(sim),
#      colData = colData(sim),
#      design = ~ sample_id)
#  dds_ref <- DESeqDataSetFromMatrix(
#      countData = counts(ref),
#      colData = colData(ref),
#      design = ~ sample_id)
#  dds_list <- list(sim = dds_sim, data = dds_ref)
#  # generate 'countsimQC' report
#  countsimQCReport(
#      ddsList = dds_list,
#      outputFile = "<file_name>.html",
#      outputDir = "<output_path>",
#      outputFormat = "html_document",
#      maxNForCorr = 200,
#      maxNForDisp = 500)

## ----iCOBRA-run-method-wrapper------------------------------------------------
# 'm' is a character string specifying a valid `pbDS` method
.run_method <- function(m) {
    res <- pbDS(pb, method = m, verbose = FALSE)
    tbl <- resDS(sim, res)
    left_join(gi, tbl, by = c("gene", "cluster_id"))
}

## ----iCOBRA-calc-perf-wrapper-------------------------------------------------
# 'x' is a list of result 'data.frame's
.calc_perf <- function(x, facet = NULL) {
    cd <- COBRAData(truth = gi,
        pval = data.frame(bind_cols(map(x, "p_val"))),
        padj = data.frame(bind_cols(map(x, "p_adj.loc"))))
    perf <- calculate_performance(cd, 
        binary_truth = "is_de", maxsplit = 1e6,
        splv = ifelse(is.null(facet), "none", facet),
        aspects = c("fdrtpr", "fdrtprcurve", "curve"))
}

## ----iCOBRA-comparison, warning = FALSE, message = FALSE, fig.height = 4, fig.width = 9----
# simulation with all DD types
sim <- simData(ref, 
    p_dd = c(rep(0.3, 2), rep(0.1, 4)),
    ng = 1e3, nc = 2e3, ns = 3, force = TRUE)
# aggregate to pseudobulks
pb <- aggregateData(sim)
# extract gene metadata
gi <- metadata(sim)$gene_info
# add truth column (must be numeric!)
gi$is_de <- !gi$category %in% c("ee", "ep")
gi$is_de <- as.numeric(gi$is_de) 

# specify methods for comparison & run them
# (must set names for methods to show in visualizations!)
names(mids) <- mids <- c("edgeR", "DESeq2", "limma-trend", "limma-voom")
res <- lapply(mids, .run_method)

# calculate performance measures 
# and prep. for plotting with 'iCOBRA'
library(iCOBRA)
perf <- .calc_perf(res, "cluster_id")
pd <- prepare_data_for_plot(perf)

# plot FDR-TPR curves by cluster
plot_fdrtprcurve(pd) +
    theme(aspect.ratio = 1) +
    scale_x_continuous(trans = "sqrt") +
    facet_wrap(~ splitval, nrow = 1)

## ----session-info-------------------------------------------------------------
sessionInfo()