## ----setup, echo=FALSE--------------------------------------------------------
suppressPackageStartupMessages({ 
    library(CNVRanger)
    library(AnnotationHub)
    library(regioneR)
    library(BSgenome.Btaurus.UCSC.bosTau6.masked)
    library(SummarizedExperiment)
    library(curatedTCGAData)
    library(TCGAutils)
    library(Gviz)
})

## ----oview, echo=FALSE, fig.wide=TRUE-----------------------------------------
vign.dir <- system.file("vignettes", package = "CNVRanger")
knitr::include_graphics(file.path(vign.dir, "CNVRanger.png"))

## ----input, echo=FALSE, fig.wide=TRUE-----------------------------------------
vign.dir <- system.file("vignettes", package = "CNVRanger")
knitr::include_graphics(file.path(vign.dir, "Input.png"))

## ----lib----------------------------------------------------------------------
library(CNVRanger)

## ----readCalls----------------------------------------------------------------
data.dir <- system.file("extdata", package="CNVRanger")
call.file <- file.path(data.dir, "Silva16_PONE_CNV_calls.csv")
calls <- read.csv(call.file, as.is=TRUE)
nrow(calls)
head(calls)

## ----nrSamples----------------------------------------------------------------
length(unique(calls[,"NE_id"]))

## ----cnvCalls-----------------------------------------------------------------
grl <- GenomicRanges::makeGRangesListFromDataFrame(calls, 
    split.field="NE_id", keep.extra.columns=TRUE)
grl

## ----sortCalls----------------------------------------------------------------
grl <- GenomicRanges::sort(grl)
grl

## ----RaggedExperiment---------------------------------------------------------
ra <- RaggedExperiment::RaggedExperiment(grl)
ra

## ----RaggedExperiment-assay---------------------------------------------------
RaggedExperiment::assay(ra[1:5,1:5])

## ----RaggedExperiment-colData-------------------------------------------------
weight <- rnorm(ncol(ra), mean=1100, sd=100)
fcr <- rnorm(ncol(ra), mean=6, sd=1.5)
RaggedExperiment::colData(ra)$weight <- round(weight, digits=2)
RaggedExperiment::colData(ra)$fcr <- round(fcr, digits=2)
RaggedExperiment::colData(ra)

## ----summarization, echo=FALSE, out.width = "40%", out.extra='style="float:right; padding:10px"'----
vign.dir <- system.file("vignettes", package = "CNVRanger")
knitr::include_graphics(file.path(vign.dir, "Summarization.png"))

## ----cnvrs--------------------------------------------------------------------
cnvrs <- populationRanges(grl, density=0.1)
cnvrs

## ----cnvrsRO------------------------------------------------------------------
ro.cnvrs <- populationRanges(grl[1:100], mode="RO", ro.thresh=0.51)
ro.cnvrs

## ----gistic-------------------------------------------------------------------
cnvrs <- populationRanges(grl, density=0.1, est.recur=TRUE)
cnvrs

## ----recurr-------------------------------------------------------------------
subset(cnvrs, pvalue < 0.05)

## ----recurrViz----------------------------------------------------------------
plotRecurrentRegions(cnvrs, genome="bosTau6", chr="chr1")

## ----overlap, echo=FALSE, out.width = "40%", out.extra='style="float:right; padding:10px"'----
vign.dir <- system.file("vignettes", package = "CNVRanger")
knitr::include_graphics(file.path(vign.dir, "Overlap.png"))

## ----getBtGenes---------------------------------------------------------------
library(AnnotationHub)
ah <- AnnotationHub::AnnotationHub()
ahDb <- AnnotationHub::query(ah, pattern = c("Bos taurus", "EnsDb"))
ahDb

## ----getBtGenes2--------------------------------------------------------------
ahEdb <- ahDb[["AH60948"]]
bt.genes <- ensembldb::genes(ahEdb)
GenomeInfoDb::seqlevelsStyle(bt.genes) <- "UCSC"
bt.genes

## ----formatBtGenes------------------------------------------------------------
sel.genes <- subset(bt.genes, seqnames %in% paste0("chr", 1:2))
sel.genes <- subset(sel.genes, gene_biotype == "protein_coding")
sel.cnvrs <- subset(cnvrs, seqnames %in% paste0("chr", 1:2))

## ----findOlaps----------------------------------------------------------------
olaps <- GenomicRanges::findOverlaps(sel.genes, sel.cnvrs, ignore.strand=TRUE)
qh <- S4Vectors::queryHits(olaps)
sh <- S4Vectors::subjectHits(olaps)
cgenes <- sel.genes[qh]
cgenes$type <- sel.cnvrs$type[sh]
subset(cgenes, select = "type")

## ----vizOlaps-----------------------------------------------------------------
cnvOncoPrint(grl, cgenes)

## ----ovlpTest, warning=FALSE--------------------------------------------------
library(regioneR)
library(BSgenome.Btaurus.UCSC.bosTau6.masked)
res <- regioneR::overlapPermTest(A=sel.cnvrs, B=sel.genes, ntimes=100, 
    genome="bosTau6", mask=NA, per.chromosome=TRUE, count.once=TRUE)
res

## ----permDist-----------------------------------------------------------------
summary(res[[1]]$permuted)

## ----vizPermTest--------------------------------------------------------------
plot(res)

## ----expression, echo=FALSE, out.width = "40%", out.extra='style="float:right; padding:10px"'----
vign.dir <- system.file("vignettes", package = "CNVRanger")
knitr::include_graphics(file.path(vign.dir, "Expression.png"))

## ----rseqdata-----------------------------------------------------------------
rseq.file <- file.path(data.dir, "counts_cleaned.txt")
rcounts <- read.delim(rseq.file, row.names=1, stringsAsFactors=FALSE)
rcounts <- as.matrix(rcounts)
dim(rcounts)
rcounts[1:5, 1:5]

## ----rse----------------------------------------------------------------------
library(SummarizedExperiment)
rranges <- GenomicRanges::granges(sel.genes)[rownames(rcounts)]
rse <- SummarizedExperiment(assays=list(rcounts=rcounts), rowRanges=rranges)
rse

## ----cnvEQTL------------------------------------------------------------------
res <- cnvEQTL(cnvrs, grl, rse, window = "1Mbp", verbose = TRUE)
res

## ----tcgaSetup, message=FALSE-------------------------------------------------
library(curatedTCGAData)
gbm <- curatedTCGAData::curatedTCGAData("GBM",
        assays=c("GISTIC_Peaks", "CNVSNP", "RNASeq2GeneNorm"),
        version = "1.1.38",
        dry.run=FALSE)
gbm

## ----tcgaGeneAnno, message=FALSE----------------------------------------------
library(TCGAutils)
gbm <- TCGAutils::symbolsToRanges(gbm, unmapped=FALSE)
for(i in 1:3) 
{
    rr <- rowRanges(gbm[[i]])
    GenomeInfoDb::genome(rr) <- "NCBI37"
    GenomeInfoDb::seqlevelsStyle(rr) <- "UCSC"
    ind <- as.character(seqnames(rr)) %in% c("chr1", "chr2")
    rowRanges(gbm[[i]]) <- rr
    gbm[[i]] <- gbm[[i]][ind,]
}
gbm

## ----gbmIntersect-------------------------------------------------------------
gbm <- MultiAssayExperiment::intersectColumns(gbm)
TCGAutils::sampleTables(gbm)
data(sampleTypes, package="TCGAutils")
sampleTypes
gbm <- TCGAutils::TCGAsplitAssays(gbm, sampleCodes="01")
gbm

## ----segmean------------------------------------------------------------------
ind <- grep("CNVSNP", names(gbm))
head( mcols(gbm[[ind]]) )
summary( mcols(gbm[[ind]])$Segment_Mean )

## ----lr2int, eval=FALSE-------------------------------------------------------
#  round( (2^lr) * 2)

## ----transformToStates--------------------------------------------------------
smean <- mcols(gbm[[ind]])$Segment_Mean
state <- round(2^smean * 2)
state[state > 4] <- 4
mcols(gbm[[ind]])$state <- state
gbm[[ind]] <- gbm[[ind]][state != 2,]
mcols(gbm[[ind]]) <- mcols(gbm[[ind]])[,3:1]
table(mcols(gbm[[ind]])$state)

## ----gbmEQTL------------------------------------------------------------------
res <- cnvEQTL(cnvrs="01_GBM_GISTIC_Peaks-20160128", 
    calls="01_GBM_CNVSNP-20160128", 
    rcounts="01_GBM_RNASeq2GeneNorm-20160128_ranged", 
    data=gbm, window="1Mbp", verbose=TRUE)
res

## ----plotEQTL-----------------------------------------------------------------
res[2]
(r <- GRanges(names(res)[2]))
plotEQTL(cnvr=r, genes=res[[2]], genome="hg19", cn="CN1")

## ----phenotype, echo=FALSE, out.width = "40%", out.extra='style="float:right; padding:10px"'----
vign.dir <- system.file("vignettes", package = "CNVRanger")
knitr::include_graphics(file.path(vign.dir, "Phenotype.png"))

## ----readCallsGWAS------------------------------------------------------------
cnv.loc <- file.path(data.dir, "CNVOut.txt") 
cnv.calls <- read.delim(cnv.loc, as.is=TRUE) 
head(cnv.calls)

## ----calls2grl----------------------------------------------------------------
cnv.calls <- GenomicRanges::makeGRangesListFromDataFrame(cnv.calls, 
    split.field="sample.id", keep.extra.columns=TRUE)
sort(cnv.calls)

## ----phenoData----------------------------------------------------------------
phen.loc <- file.path(data.dir, "Pheno.txt")
colData <- read.delim(phen.loc, as.is=TRUE)
head(colData)

## ----importPhen---------------------------------------------------------------
mcols(cnv.calls) <- colData
re <- RaggedExperiment::RaggedExperiment(cnv.calls)
re

## ----map----------------------------------------------------------------------
map.loc <- file.path(data.dir, "MapPenn.txt")
map.df <- read.delim(map.loc, as.is=TRUE)
head(map.df)

## ----importPhenRagged---------------------------------------------------------
phen.info <- setupCnvGWAS("example", cnv.out.loc=re, map.loc=map.loc)
phen.info

## ----Wdir---------------------------------------------------------------------
all.paths <- phen.info$all.paths
all.paths

## ----CNVGWASNames-------------------------------------------------------------
# Define chr correspondence to numeric
chr.code.name <- data.frame(   
                    V1 = c(16, 25, 29:31), 
                    V2 = c("1A", "4A", "25LG1", "25LG2", "LGE22"))

## ----CNVGWA-------------------------------------------------------------------
segs.pvalue.gr <- cnvGWAS(phen.info, chr.code.name=chr.code.name, method.m.test="none")
segs.pvalue.gr

## ----echo=FALSE, fig.cap="Definition of CNV segments. The figure shows construction of a CNV segment from 4 individual CNV calls in a CNV region based on pairwise copy number concordance between adjacent probes.", out.width = '100%'----
knitr::include_graphics(file.path(vign.dir, "CNVsegments.png"))

## ----manh---------------------------------------------------------------------
## Define the chromosome order in the plot
order.chrs <- c(1:24, "25LG1", "25LG2", 27:28, "LGE22", "1A", "4A")

## Chromosome sizes
chr.size.file <- file.path(data.dir, "Parus_major_chr_sizes.txt")
chr.sizes <- scan(chr.size.file)
chr.size.order <- data.frame(chr=order.chrs, sizes=chr.sizes, stringsAsFactors=FALSE)

## Plot a pdf file with a manhatthan within the 'Results' workfolder
plotManhattan(all.paths, segs.pvalue.gr, chr.size.order, plot.pdf=FALSE)

## ----prodGDS------------------------------------------------------------------
## Create a GDS file in disk and export the SNP probe positions
probes.cnv.gr <- generateGDS(phen.info, chr.code.name=chr.code.name)
probes.cnv.gr

## Run GWAS with existent GDS file
segs.pvalue.gr <- cnvGWAS(phen.info, chr.code.name=chr.code.name, produce.gds=FALSE)

## ----importLRR----------------------------------------------------------------
# List files to import LRR/BAF 
files <- list.files(data.dir, pattern = "\\.cnv.txt.adjusted$")
samples <- sub(".cnv.txt.adjusted$", "", files)
samples <- sub("^GT","", samples)
sample.files <- data.frame(file.names=files, sample.names=samples)
 
# All missing samples will have LRR = '0' and BAF = '0.5' in all SNPs listed in the GDS file
importLrrBaf(all.paths, data.dir, sample.files, verbose=FALSE)

# Read the GDS to check if the LRR/BAF nodes were added
cnv.gds <- file.path(all.paths[1], "CNV.gds")
(genofile <- SNPRelate::snpgdsOpen(cnv.gds, allow.fork=TRUE, readonly=FALSE))
gdsfmt::closefn.gds(genofile)

# Run the CNV-GWAS with existent GDS
segs.pvalue.gr <- cnvGWAS(phen.info, chr.code.name=chr.code.name, produce.gds=FALSE, run.lrr=TRUE)

## ----sessionInfo--------------------------------------------------------------
sessionInfo()