## ----echo=FALSE--------------------------------------------------------------- options(digits=3) ## ----eval=FALSE--------------------------------------------------------------- # if (!requireNamespace("BiocManager")) # install.packages("BiocManager") # BiocManager::install("borealis") ## ----message=FALSE, warning=FALSE--------------------------------------------- library("borealis") # Set data locations outdir <- tempdir() extdata <- system.file("extdata","bismark", package="borealis") # Run borealis results <- runBorealis(extdata,nThreads=2, chrs="chr14", suffix=".gz", outprefix=file.path(outdir,"vignette_borealis_"), modelOutPrefix=file.path(outdir,"vignette_CpG_model")) ## ----------------------------------------------------------------------------- # Read in the name of all files for analysis borealis_in <- dir(outdir,pattern="*DMLs.tsv") length(borealis_in) ## ----message=FALSE, warning=FALSE--------------------------------------------- # Read in list of Borealis output files and create a dataframe for each for (file in borealis_in) { name <- sub("_DMLs.tsv", "", file) assign(name,GenomicRanges::makeGRangesFromDataFrame( read.table(file.path(outdir,file), header=TRUE, stringsAsFactors=FALSE), start.field="pos", end.field="pos.1", seqnames.field="chr", keep.extra.columns=TRUE)) } # Create a list of the newly created dataframes list_object_names <- ls(pattern="borealis_patient") listGRanges <- lapply(list_object_names, get) ## ----------------------------------------------------------------------------- length(list_object_names) list_object_names[1] ## ----------------------------------------------------------------------------- listGRanges[[1]][1,] ## ----------------------------------------------------------------------------- # Add sample ID and a corrected p-value to each and output as new files (.padj) for (i in seq_along(listGRanges)) { sample=sub("_chr.*", "", list_object_names[i]) listGRanges[[i]]$sampleID <- sample listGRanges[[i]]$pAdj <- p.adjust( listGRanges[[i]]$pVal, method="BH") } ## ----------------------------------------------------------------------------- listGRanges[[1]][1,] ## ----------------------------------------------------------------------------- # Create a single GRanges obect with data for all samples and a dataframe for summaries combined_files <- Reduce(c,listGRanges) combined_files_df<-data.frame(combined_files) ## ----------------------------------------------------------------------------- # How many rows of data in combined GRange object? length(combined_files) ## ----------------------------------------------------------------------------- # Create table of unique positions and mu/theta values mu_theta <- unique(subset(combined_files_df, select=-c(x,n,pVal, isHypo, pAdj, effSize, sampleID))) ## ----------------------------------------------------------------------------- #Number of unique samples length(unique(combined_files_df$sampleID)) #Number of unique CpG sites nrow(unique(mu_theta)) ## ----------------------------------------------------------------------------- #generate summaries for mu and theta summary(mu_theta$mu) summary(mu_theta$theta) ## ----------------------------------------------------------------------------- # Create table of unique positions and depth/p-val/padj for each position in # each case depth_pvals_eff <- unique(combined_files_df) ## ----------------------------------------------------------------------------- #Summarize read depths summary(depth_pvals_eff$n) ## ----------------------------------------------------------------------------- #Summarize pvals summary(depth_pvals_eff$pVal) ## ----------------------------------------------------------------------------- #Summarize corrected pvals summary(depth_pvals_eff$pAdj) ## ----------------------------------------------------------------------------- #Summarize fraction of methylation summary(depth_pvals_eff$x/depth_pvals_eff$n) ## ----------------------------------------------------------------------------- #Summarize effect size summary(depth_pvals_eff$effSize) ## ----------------------------------------------------------------------------- # Detection of outliers based on number of significant sites # Count significant CpG sites per patient signif_only <- subset(combined_files_df, pVal <= 0.05) signif_counts <- dplyr::count(as.data.frame(signif_only),sampleID) # Calculate the percentiles of the number of significant sites sig_quantiles <- quantile(signif_counts$n, probs=c(0.025, 0.05, 0.95, 0.975, 0.999)) # Check if nay patients are above the 99.9th percentile subset(signif_counts, n >= sig_quantiles["99.9%"]) ## ----------------------------------------------------------------------------- # What is the most significant CpG site between all samples? subset(combined_files_df, pVal == min(combined_files$pVal)) ## ----eval=FALSE--------------------------------------------------------------- # if (!requireNamespace("BiocManager")) # install.packages("BiocManager") # BiocManager::install("annotatr") ## ----message=FALSE, results='hide', warning=FALSE----------------------------- #Assign approproate genome version based on alignments genome.version <- "hg19" # Select annnotation classes we want from annotatr (can be user-customized) myAnnots <- c('_genes_promoters', '_genes_5UTRs', '_genes_exons', '_genes_3UTRs','_cpg_islands') ## ----message=FALSE, results='hide', warning=FALSE----------------------------- #Read in patient 72 Grange data for annotation dmrs.gr<-subset(combined_files, sampleID == "vignette_borealis_patient_72") # Annotate using annotatr myAnnots <- paste(genome.version,myAnnots,sep="") annots.all.gr <- annotatr::build_annotations(genome = genome.version, annotations = myAnnots) allAnnot <- annotatr::annotate_regions(regions=dmrs.gr, annotations=annots.all.gr, ignore.strand=TRUE, minoverlap=0) ## ----------------------------------------------------------------------------- # Extract the annotated site with the smallest p-value subset(allAnnot, pVal == min(allAnnot$pVal))$annot ## ----sg, fig.wide=TRUE, fig.cap="plotCpGsite function demo"------------------- # Use Borealis plotting function to investigate this site further plotCpGsite("chr14:24780288", sampleOfInterest="patient_72", modelFile=file.path(outdir,"vignette_CpG_model_chr14.csv"), methCountFile=file.path(outdir, "vignette_CpG_model_rawMethCount_chr14.tsv"), totalCountFile=file.path(outdir, "vignette_CpG_model_rawTotalCount_chr14.tsv")) ## ----------------------------------------------------------------------------- padjThresh <- 0.05 ## ----------------------------------------------------------------------------- # Calculate how may CpGs per annotatr feature and store in dataframe allAnnot <- as.data.frame(allAnnot) featureids <- allAnnot$annot.id featurecnts <- as.data.frame(table(featureids)) colnames(featurecnts) <- c("annot.id", "NoSites") ## ----------------------------------------------------------------------------- head(featurecnts) ## ----------------------------------------------------------------------------- # Calculate how many sites per feature pass p-value threshold # Add data to new summary dataframe signifonly <- subset(allAnnot, pAdj<=padjThresh) signifonly <- signifonly$annot.id signifonlycnt <- as.data.frame(table(signifonly)) colnames(signifonlycnt) <- c("annot.id", "signifCount") featurecnts <- merge(featurecnts, signifonlycnt, by.x="annot.id", by.y="annot.id", all.x=TRUE) ## ----------------------------------------------------------------------------- # What fraction of sites per feature pass p-value threshold? featurecnts$fractionSignif <- featurecnts$signifCount/featurecnts$NoSites ## ----------------------------------------------------------------------------- # Let's combine the data for final output locations <- subset(allAnnot, select=c("annot.id", "annot.seqnames", "annot.start", "annot.end")) featurecnts <- merge(unique(locations), featurecnts, by="annot.id") genemap <- unique(cbind(allAnnot$annot.symbol, allAnnot$annot.id, allAnnot$annot.tx_id,allAnnot$annot.width, allAnnot$sampleID)) colnames(genemap) <- c("annot.symbol", "annot.id", "annot.tx_id", "annot.width", "SampleID") summarized <- merge(featurecnts, genemap, by="annot.id") summarized$signifCount[is.na(summarized$signifCount)] <- 0 summarized$fractionSignif[is.na(summarized$fractionSignif)] <- 0 ## ----------------------------------------------------------------------------- # Select the LTB4R promoter region subset(summarized, select=c(annot.symbol, NoSites, signifCount, fractionSignif), (annot.symbol=="LTB4R" & grepl("promoter", annot.id))) ## ----sessionInfo, echo=FALSE-------------------------------------------------- sessionInfo()