## ----setup, echo=FALSE, results="hide"------------------------------------------------------------ knitr::opts_chunk$set( tidy = FALSE, cache = TRUE, fig.align = 'center', dev = "png", package.startup.message = FALSE, message = FALSE, error = FALSE, warning = TRUE ) options(width = 100) ## ----simResult, cache=TRUE, fig.height=4, fig.width=4--------------------------------------------- # load library library("variancePartition") # load simulated data: # geneExpr: matrix of gene expression values # info: information/metadata about each sample data(varPartData) # Specify variables to consider # Age is continuous so model it as a fixed effect # Individual and Tissue are both categorical, # so model them as random effects # Note the syntax used to specify random effects form <- ~ Age + (1 | Individual) + (1 | Tissue) + (1 | Batch) # Fit model and extract results # 1) fit linear mixed model on gene expression # If categorical variables are specified, # a linear mixed model is used # If all variables are modeled as fixed effects, # a linear model is used # each entry in results is a regression model fit on a single gene # 2) extract variance fractions from each model fit # for each gene, returns fraction of variation attributable # to each variable # Interpretation: the variance explained by each variables # after correcting for all other variables # Note that geneExpr can either be a matrix, # and EList output by voom() in the limma package, # or an ExpressionSet varPart <- fitExtractVarPartModel(geneExpr, form, info) # sort variables (i.e. columns) by median fraction # of variance explained vp <- sortCols(varPart) # Figure 1a # Bar plot of variance fractions for the first 10 genes plotPercentBars(vp[1:10, ]) # Figure 1b # violin plot of contribution of each variable to total variance plotVarPart(vp) ## ----accessResults, cache=TRUE, warning=FALSE----------------------------------------------------- # Access first entries head(varPart) # Access first entries for Individual head(varPart$Individual) # sort genes based on variance explained by Individual head(varPart[order(varPart$Individual, decreasing = TRUE), ]) ## ----savePlot, cache=TRUE, eval=FALSE------------------------------------------------------------- # fig <- plotVarPart(vp) # ggsave(file, fig) ## ----plotStratify, cache=TRUE, warning=FALSE, fig.height=4, fig.width=4--------------------------- # get gene with the highest variation across Tissues # create data.frame with expression of gene i and Tissue # type for each sample i <- which.max(varPart$Tissue) GE <- data.frame(Expression = geneExpr[i, ], Tissue = info$Tissue) # Figure 2a # plot expression stratified by Tissue plotStratify(Expression ~ Tissue, GE, main = rownames(geneExpr)[i]) # get gene with the highest variation across Individuals # create data.frame with expression of gene i and Tissue # type for each sample i <- which.max(varPart$Individual) GE <- data.frame( Expression = geneExpr[i, ], Individual = info$Individual ) # Figure 2b # plot expression stratified by Tissue label <- paste("Individual:", format(varPart$Individual[i] * 100, digits = 3 ), "%") main <- rownames(geneExpr)[i] plotStratify(Expression ~ Individual, GE, colorBy = NULL, text = label, main = main ) ## ----cache=TRUE----------------------------------------------------------------------------------- library("lme4") # fit regression model for the first gene form_test <- geneExpr[1, ] ~ Age + (1 | Individual) + (1 | Tissue) fit <- lmer(form_test, info, REML = FALSE) # extract variance statistics calcVarPart(fit) ## ----canCorPairs, cache=TRUE, results='hide', fig.width=5, fig.height=5--------------------------- form <- ~ Individual + Tissue + Batch + Age + Height # Compute Canonical Correlation Analysis (CCA) # between all pairs of variables # returns absolute correlation value C <- canCorPairs(form, info) # Plot correlation matrix # between all pairs of variables plotCorrMatrix(C) ## ----simResult-omit, cache=TRUE, results='hide'--------------------------------------------------- form <- ~ Age + (1 | Individual) + (1 | Tissue) + (1 | Batch) # Fit model results <- fitVarPartModel(geneExpr, form, info) # Extract results varPart <- extractVarPart(results) ## ----simResult-fast-two-step, cache=TRUE, results='hide'------------------------------------------ # Fit model and run summary() function on each model fit vpSummaries <- fitVarPartModel(geneExpr, form, info, fxn = summary) ## ----simResult-fast-two-step1, cache=TRUE--------------------------------------------------------- # Show results of summary() for the first gene vpSummaries[[1]] ## ----vpa1, echo=TRUE, cache=TRUE, results='hide'-------------------------------------------------- form <- ~ (1 | Tissue) + (1 | Individual) + (1 | Batch) + Age varPart <- fitExtractVarPartModel(geneExpr, form, info) ## ----vpa2, echo=TRUE, cache=TRUE, results='hide'-------------------------------------------------- library("limma") # subtract out effect of Batch fit <- lmFit(geneExpr, model.matrix(~Batch, info)) res <- residuals(fit, geneExpr) # fit model on residuals form <- ~ (1 | Tissue) + (1 | Individual) + Age varPartResid <- fitExtractVarPartModel(res, form, info) ## ----vpa3, echo=TRUE, cache=TRUE, results='hide'-------------------------------------------------- # subtract out effect of Batch with linear mixed model modelFit <- fitVarPartModel(geneExpr, ~ (1 | Batch), info) res <- residuals(modelFit) # fit model on residuals form <- ~ (1 | Tissue) + (1 | Individual) + Age varPartResid <- fitExtractVarPartModel(res, form, info) ## ----vpa4, echo=TRUE, cache=TRUE, results='hide'-------------------------------------------------- # extract residuals directly without storing intermediate results residList <- fitVarPartModel(geneExpr, ~ (1 | Batch), info, fxn = residuals ) # convert list to matrix residMatrix <- do.call(rbind, residList) ## ----withinTissue, echo=TRUE, cache=TRUE, results='hide', fig.height=5, fig.width=4--------------- # specify formula to model within/between individual variance # separately for each tissue # Note that including +0 ensures each tissue is modeled explicitly # Otherwise, the first tissue would be used as baseline form <- ~ (Tissue + 0 | Individual) + Age + (1 | Tissue) + (1 | Batch) # fit model and extract variance percents varPart <- fitExtractVarPartModel(geneExpr, form, info, showWarnings = FALSE) # violin plot plotVarPart(sortCols(varPart), label.angle = 60) ## ----echo=TRUE, cache=TRUE, results='hide'-------------------------------------------------------- form <- ~ (1 | Individual) + (1 | Tissue) + Age + Height # fit model res <- fitVarPartModel(geneExpr[1:4, ], form, info) ## ----echo=TRUE, cache=TRUE------------------------------------------------------------------------ # evaluate the collinearity score on the first model fit # this reports the correlation matrix between coefficient estimates # for fixed effects # the collinearity score is the maximum absolute correlation value # If the collinearity score > .99 then the variance partition # estimates may be problematic # In that case, a least one variable should be omitted colinearityScore(res[[1]]) ## ----echo=TRUE, cache=TRUE, results='hide'-------------------------------------------------------- form <- ~ (1 | Individual) + (1 | Tissue) + Age + Height # Specify custom weights # In this example the weights are simulated from a # uniform distribution and are not meaningful. weights <- matrix(runif(length(geneExpr)), nrow = nrow(geneExpr)) # Specify custom weights res <- fitExtractVarPartModel(geneExpr[1:4, ], form, info, weightsMatrix = weights[1:4, ] ) ## ----vpInteraction, echo=TRUE, cache=TRUE, results='hide', fig.width=4, fig.height=4-------------- form <- ~ (1 | Individual) + Age + Height + (1 | Tissue) + (1 | Batch) + (1 | Batch:Tissue) # fit model vpInteraction <- fitExtractVarPartModel(geneExpr, form, info) plotVarPart(sortCols(vpInteraction)) ## ----limma, echo=TRUE, cache=TRUE, results='hide'------------------------------------------------- library("limma") library("edgeR") # identify genes that pass expression cutoff isexpr <- rowSums(cpm(geneCounts) > 1) >= 0.5 * ncol(geneCounts) # create data structure with only expressed genes gExpr <- DGEList(counts = geneCounts[isexpr, ]) # Perform TMM normalization gExpr <- calcNormFactors(gExpr) # Specify variables to be included in the voom() estimates of # uncertainty. # Recommend including variables with a small number of categories # that explain a substantial amount of variation design <- model.matrix(~Batch, info) # Estimate precision weights for each gene and sample # This models uncertainty in expression measurements vobjGenes <- voom(gExpr, design) # Define formula form <- ~ (1 | Individual) + (1 | Tissue) + (1 | Batch) + Age # variancePartition seamlessly deals with the result of voom() # by default, it seamlessly models the precision weights # This can be turned off with useWeights=FALSE varPart <- fitExtractVarPartModel(vobjGenes, form, info) ## ----DESeq2, echo=TRUE, cache=TRUE, results='hide'------------------------------------------------ library("DESeq2") # create DESeq2 object from gene-level counts and metadata dds <- DESeqDataSetFromMatrix( countData = geneCounts, colData = info, design = ~1 ) # Estimate library size correction scaling factors dds <- estimateSizeFactors(dds) # identify genes that pass expression cutoff isexpr <- rowSums(fpm(dds) > 1) >= 0.5 * ncol(dds) # compute log2 Fragments Per Million # Alternatively, fpkm(), vst() or rlog() could be used quantLog <- log2(fpm(dds)[isexpr, ] + 1) # Define formula form <- ~ (1 | Individual) + (1 | Tissue) + (1 | Batch) + Age # Run variancePartition analysis varPart <- fitExtractVarPartModel(quantLog, form, info) ## ----tximport, echo=TRUE, cache=TRUE, results='hide', eval=FALSE---------------------------------- # library("tximportData") # library("tximport") # library("readr") # # # Get data from folder where tximportData is installed # dir <- system.file("extdata", package = "tximportData") # samples <- read.table(file.path(dir, "samples.txt"), header = TRUE) # files <- file.path(dir, "kallisto", samples$run, "abundance.tsv") # names(files) <- paste0("sample", 1:6) # # tx2gene <- read.csv(file.path(dir, "tx2gene.csv")) # # # reads results from kallisto # txi <- tximport(files, # type = "kallisto", tx2gene = tx2gene, # countsFromAbundance = "lengthScaledTPM" # ) # # # define metadata (usually read from external source) # info_tximport <- data.frame( # Sample = sprintf("sample%d", 1:6), # Disease = c("case", "control")[c(rep(1, 3), rep(2, 3))] # ) # # # Extract counts from kallisto # y <- DGEList(txi$counts) # # # compute library size normalization # y <- calcNormFactors(y) # # # apply voom to estimate precision weights # design <- model.matrix(~Disease, data = info_tximport) # vobj <- voom(y, design) # # # define formula # form <- ~ (1 | Disease) # # # Run variancePartition analysis (on only 10 genes) # varPart_tx <- fitExtractVarPartModel( # vobj[1:10, ], form, # info_tximport # ) ## ----ballgown, echo=TRUE, cache=TRUE, results='hide'---------------------------------------------- library("ballgown") # Get data from folder where ballgown is installed data_directory <- system.file("extdata", package = "ballgown") # Load results of Cufflinks/Tablemaker bg <- ballgown( dataDir = data_directory, samplePattern = "sample", meas = "all" ) # extract gene-level FPKM quantification # Expression can be convert to log2-scale if desired gene_expression <- gexpr(bg) # extract transcript-level FPKM quantification # Expression can be convert to log2-scale if desired transcript_fpkm <- texpr(bg, "FPKM") # define metadata (usually read from external source) info_ballgown <- data.frame( Sample = sprintf("sample%02d", 1:20), Batch = rep(letters[1:4], 5), Disease = c("case", "control")[c(rep(1, 10), rep(2, 10))] ) # define formula form <- ~ (1 | Batch) + (1 | Disease) # Run variancePartition analysis # Gene-level analysis varPart_gene <- fitExtractVarPartModel( gene_expression, form, info_ballgown ) # Transcript-level analysis varPart_transcript <- fitExtractVarPartModel( transcript_fpkm, form, info_ballgown ) ## ----sim, echo=FALSE, cache=TRUE, results='hide'-------------------------------------------------- library("variancePartition") sim_data <- function(n, p, var_indiv, var_site) { info <- data.frame( ID = paste("s", 1:n, sep = ""), Individual = factor(rep(round(seq(1, n / 6, length.out = n / 6)), 6)), Tissue = factor(sort(rep(toupper(letters[1:3]), n / 3))), Age = rpois(n, 50) ) geneExpr <- matrix(0, nrow = p, ncol = n) for (i in 1:p) { beta_indiv <- rnorm(nlevels(info$Individual), 0, sqrt(var_indiv)) beta_site <- rnorm(nlevels(info$Tissue), 0, sqrt(var_site)) eta <- model.matrix(~Individual, info) %*% beta_indiv + model.matrix(~ Tissue + 0, info) %*% beta_site noise <- rbeta(1, 10, 100) errVar <- var(eta) * (noise) / (1 - noise) geneExpr[i, ] <- eta + rnorm(n, 0, sqrt(errVar)) } colnames(geneExpr) <- paste("s", 1:ncol(geneExpr), sep = "") rownames(geneExpr) <- paste("gene", 1:nrow(geneExpr), sep = "") return(list(info = info, geneExpr = geneExpr)) } plotVar <- function(geneExpr, info) { form <- ~ (1 | Individual) + (1 | Tissue) varPart <- suppressWarnings(fitExtractVarPartModel(geneExpr, form, info, showWarnings=FALSE)) plotVarPart(varPart, col = c(rainbow(8)[1:2], "grey85")) + theme(aspect.ratio=1) } plotVarCross <- function(geneExpr, info, label.angle = 30) { form <- ~ (Tissue + 0 | Individual) + (1 | Tissue) # res = fitVarPartModel( geneExpr, form, info ) # varPart = extractVarPart( res ) varPart <- suppressWarnings(fitExtractVarPartModel(geneExpr, form, info, showWarnings = FALSE)) plotVarPart(varPart, label.angle = label.angle) + theme(aspect.ratio=1) } plotPCA <- function(geneExpr, col, main='') { dcmp <- prcomp(t(geneExpr)) par(mar = c(4, 4, 1, 1) + 0.1) plot(dcmp$x[, 1:2], col = col, main=main) } plotTree <- function(geneExpr, col, main=main) { hc <- hclust(dist(t(geneExpr))) library("dendextend") hcd <- as.dendrogram(hc) labels_colors(hcd) <- col[match(labels(hcd), names(col))] par(mar = c(4, 4, 1, 1) + 0.1, cex = .6) plot(hcd, yaxt = "n", horiz = TRUE, main=main) } ## ----siteDominant, echo=FALSE, fig.height=8, fig.width=8, fig.show="hold", warnings=TRUE---------- set.seed(1) n <- 60 p <- 200 data <- sim_data(n, p, 1, 4) colTissue <- rainbow(nlevels(data$info$Tissue))[data$info$Tissue] names(colTissue) <- data$info$ID colIndiv <- rainbow(nlevels(data$info$Individual))[data$info$Individual] names(colIndiv) <- data$info$ID par(mfrow=c(2,2)) plotPCA(data$geneExpr, colTissue, main = "(a) PCA - colored by Tissue") plotPCA(data$geneExpr, colIndiv, main = "(b) PCA - colored by Individual") plotTree(data$geneExpr, colTissue, main ="(c) hclust - colored by Tissue") legend("topleft", legend = levels(data$info$Tissue), fill = rainbow(nlevels(data$info$Tissue))[1:nlevels(data$info$Tissue)], title = "Tissue") plotTree(data$geneExpr, colIndiv, main="(d) hclust - colored by Individual") legend("topleft", legend = levels(data$info$Individual), fill = rainbow(nlevels(data$info$Individual))[1:nlevels(data$info$Individual)], title = "Individual") ## ----cowplot, echo=FALSE, fig.width=8------------------------------------------------------------- fig1 = plotVar(data$geneExpr, data$info) fig2 = plotVarCross(data$geneExpr, data$info, label.angle = 60) cowplot::plot_grid(fig1 + ggtitle("Variance partitioning"), fig2 + ggtitle("variancePartition - within Tissue"), axis="tblr", align="hv") ## ----IndivDominant, echo=FALSE, fig.height=8, fig.width=8----------------------------------------- set.seed(1) n <- 60 p <- 200 data <- sim_data(n, p, 3, 1) colTissue <- rainbow(nlevels(data$info$Tissue))[data$info$Tissue] names(colTissue) <- data$info$ID colIndiv <- rainbow(nlevels(data$info$Individual))[data$info$Individual] names(colIndiv) <- data$info$ID par(mfrow=c(2,2)) plotPCA(data$geneExpr, colTissue, main = "(a) PCA - colored by Tissue") plotPCA(data$geneExpr, colIndiv, main = "(b) PCA - colored by Individual") plotTree(data$geneExpr, colTissue, main ="(c) hclust - colored by Tissue") legend("topleft", legend = levels(data$info$Tissue), fill = rainbow(nlevels(data$info$Tissue))[1:nlevels(data$info$Tissue)], title = "Tissue") plotTree(data$geneExpr, colIndiv, main="(d) hclust - colored by Individual") legend("topleft", legend = levels(data$info$Individual), fill = rainbow(nlevels(data$info$Individual))[1:nlevels(data$info$Individual)], title = "Individual") ## ----cowplot2, echo=FALSE, fig.width=8------------------------------------------------------------ fig1 = plotVar(data$geneExpr, data$info) fig2 = plotVarCross(data$geneExpr, data$info, label.angle = 60) cowplot::plot_grid(fig1 + ggtitle("Variance partitioning"), fig2 + ggtitle("variancePartition - within Tissue"), axis="tblr", align="hv") ## ----stop, echo=FALSE, cache=FALSE---------------------------------------------------------------- # if( "cluster" %in% class(cl) ){ if (exists("cl")) { library("doParallel") # stop cluster and catch warning if invalid res <- tryCatch({ stopCluster(cl) }, warning = function(x) { }, error = function(x) { }, finally = { }) # warning("STOP CLUSTER!!!!!!!!!!!!!!!!!\n") } ## ----DE, echo=FALSE, cache=TRUE, fig.height=5, fig.width=8, warnings=FALSE------------------------ set.seed(1) library("variancePartition") n <- 500 data <- data.frame(Sex = factor(c(rep("F", n), rep("M", n)))) data$expression <- rnorm(2 * n, (as.integer(data$Sex) - 1) * 2, .3) data$Sex <- factor(data$Sex) fit <- lm(expression ~ Sex, data) # calcVarPart(fit) # coef(fit)[2] fig1 = plotStratify(expression ~ Sex, data, ylim = c(-6, 9)) + annotate("text", x = 0.5, y = 9, hjust = 0, label = paste("fold change:", format(coef(fit)[2], digits = 3)), size = 5.5) + annotate("text", x = 0.5, y = 8, hjust = 0, label = paste("% variance of expression:", format(calcVarPart(fit)[1] * 100, digits = 3), "%"), size = 5.5) + ggtitle("Example A") n <- 500 data <- data.frame(Sex = factor(c(rep("F", n), rep("M", n)))) data$expression <- rnorm(2 * n, (as.integer(data$Sex) - 1) * 2, 2.01) data$Sex <- factor(data$Sex) fit <- lm(expression ~ Sex, data) # calcVarPart(fit) # coef(fit)[2] fig2 = plotStratify(expression ~ Sex, data, ylim = c(-6.6, 9)) + annotate("text", x = 0.5, y = 9, hjust = 0, label = paste("fold change:", format(coef(fit)[2], digits = 3)), size = 5.5) + annotate("text", x = 0.5, y = 8, hjust = 0, label = paste("% variance of expression:", format(calcVarPart(fit)[1] * 100, digits = 3), "%"), size = 5.5) + ggtitle("Example B") cowplot::plot_grid(fig1 + theme(aspect.ratio=1), fig2+ theme(aspect.ratio=1)) ## ----session, echo=FALSE-------------------------------------------------------------------------- sessionInfo()