## ---- message=FALSE------------------------------------------------------ library(Homo.sapiens) columns(Homo.sapiens) ## ------------------------------------------------------------------------ g <- list() g[["USF2"]] <- select(Homo.sapiens, "USF2", c("TXID","TXNAME"), "SYMBOL") g[["CHPF"]] <- select(Homo.sapiens, "CHPF", c("TXID","TXNAME"), "SYMBOL") g ## ------------------------------------------------------------------------ library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene ebt <- exonsBy(txdb, by="tx") head(names(ebt)) usf2 <- ebt[ g[["USF2"]]$TXID ] chpf <- ebt[ g[["CHPF"]]$TXID ] ## ------------------------------------------------------------------------ usf2 chpf ## ------------------------------------------------------------------------ width(usf2) sum(width(usf2)) ## ------------------------------------------------------------------------ usf2.r <- range(unlist(usf2)) chpf.r <- range(unlist(chpf)) ## ------------------------------------------------------------------------ library(rafalib) lens <- length(usf2) nullplot(start(usf2.r),end(usf2.r),0,2) segments(start(usf2[[1]]), rep(1,lens[1]), end(usf2[[1]]), rep(1,lens[1]), lwd=3) ## ------------------------------------------------------------------------ plotGRangesList <- function(x,name="") { r <- range(unlist(range(x))) nullplot(start(r),end(r),0.5,length(x)+0.5, main=name,xlab=seqnames(x[[1]][1])) lens <- elementNROWS(x) for (i in seq_along(x)) { segments(start(x[[i]]), rep(i,lens[i]), end(x[[i]]), rep(i,lens[i]), lwd=3) } } plotGRangesList(usf2, "USF2") plotGRangesList(chpf, "CHPF") ## ------------------------------------------------------------------------ dir <- system.file("extdata", package="bioc2016eda") samples <- read.csv(file.path(dir,"samples.csv")) samples ## ---- message=FALSE------------------------------------------------------ library(GenomicAlignments) bamfiles <- file.path(dir,paste0(samples$run,".bam")) file.exists(bamfiles) ## ------------------------------------------------------------------------ for (i in seq_along(bamfiles)) { sortBam(bamfiles[i], destination=file.path(dir,paste0(samples$run[i],"_sort"))) indexBam(file.path(dir,paste0(samples$run[i],"_sort.bam"))) } bamfiles <- file.path(dir,paste0(samples$run,"_sort.bam")) file.exists(bamfiles) ## ------------------------------------------------------------------------ usf2.r <- keepStandardChromosomes(usf2.r) gap <- readGAlignmentPairs(bamfiles[1], param=ScanBamParam(which=usf2.r)) gap ## ------------------------------------------------------------------------ cov <- coverage(gap) cov.genome <- as.integer(cov[usf2.r][["chr19"]]) pos.genome <- seq(from=start(usf2.r),to=end(usf2.r)) plot(pos.genome, cov.genome, xlab="position (genome)", ylab="coverage") ## ------------------------------------------------------------------------ fco <- findCompatibleOverlaps(gap, usf2) fco countSubjectHits(fco) ## ------------------------------------------------------------------------ tab <- table(queryHits(fco), subjectHits(fco)) tab <- as.matrix(tab) frags <- apply(tab, 1, paste, collapse="-") frags[1] ## ------------------------------------------------------------------------ table(frags) sum(table(frags)) length(unique(queryHits(fco))) ## ------------------------------------------------------------------------ idx <- queryHits(fco)[subjectHits(fco) == 3] gr <- as(gap[idx],"GRanges") strand(gr) <- "*" m2tx.start <- mapToTranscripts(resize(gr, width=1, fix="start"), usf2[3]) m2tx.start ## ------------------------------------------------------------------------ m2tx.end <- mapToTranscripts(resize(gr, width=1, fix="end"), usf2[3]) m2tx <- GRanges(seqnames(m2tx.start), IRanges(start(m2tx.start),start(m2tx.end))) m2tx ## ------------------------------------------------------------------------ readTxFragments <- function(file, transcript) { r <- range(transcript[[1]]) r <- keepStandardChromosomes(r) # suppress warnings about alignments with ambiguous pairing suppressWarnings({ gap <- readGAlignmentPairs(file, param=ScanBamParam(which=r)) }) fco <- findCompatibleOverlaps(gap, transcript) idx <- queryHits(fco) gr <- as(gap[idx],"GRanges") strand(gr) <- "*" m2tx.start <- mapToTranscripts(resize(gr, width=1, fix="start"), transcript) m2tx.end <- mapToTranscripts(resize(gr, width=1, fix="end"), transcript) tx.strand <- as.character(strand(transcript)[[1]][1]) if (tx.strand == "+") { m2tx <- GRanges(seqnames(m2tx.start), IRanges(start(m2tx.start),start(m2tx.end))) } else { m2tx <- GRanges(seqnames(m2tx.start), IRanges(start(m2tx.end),start(m2tx.start))) } m2tx } ## ------------------------------------------------------------------------ mypar() nullplot(0,1600,0,900,xlab="position (tx)",ylab="coverage") for (i in seq_along(bamfiles)) { frags <- readTxFragments(bamfiles[i], usf2[3]) cov.tx <- coverage(frags) lines(as.integer(cov.tx[[1]]), col=samples$center[i], lwd=2) } ## ------------------------------------------------------------------------ mypar() nullplot(0,3100,0,150,xlab="position (tx)",ylab="coverage") for (i in seq_along(bamfiles)) { frags <- readTxFragments(bamfiles[i], chpf[2]) cov.tx <- coverage(frags) lines(as.integer(cov.tx[[1]]), col=samples$center[i], lwd=2) } ## ------------------------------------------------------------------------ frags <- readTxFragments(bamfiles[1], usf2[3]) hist(width(frags), col="grey", border="white") ## ------------------------------------------------------------------------ mypar(4,1,mar=c(2,2,1,1)) for (f in bamfiles) { frags <- readTxFragments(f, usf2[3]) plot(start(frags), width(frags), xlim=c(200,1300), ylim=c(50,400), pch=15, col=rgb(0,0,0,.2), cex=.5, xlab="", ylab="") } ## ------------------------------------------------------------------------ mypar(4,1,mar=c(2,2,1,1)) for (f in bamfiles) { frags <- readTxFragments(f, chpf[2]) plot(start(frags), width(frags), xlim=c(0,3000), ylim=c(50,400), pch=15, col=rgb(0,0,0,.2), cex=.5, xlab="", ylab="") } ## ---- message=FALSE------------------------------------------------------ library(BSgenome.Hsapiens.UCSC.hg19) usf2.seq <- extractTranscriptSeqs(Hsapiens, usf2[3])[[1]] chpf.seq <- extractTranscriptSeqs(Hsapiens, chpf[2])[[1]] usf2.seq chpf.seq ## ------------------------------------------------------------------------ frags <- readTxFragments(bamfiles[1], usf2[3]) length(frags) / 4^3 ## ------------------------------------------------------------------------ start.seq <- as(Views(usf2.seq, start=start(frags), width=3), "DNAStringSet") seq.tab <- table(start.seq) sort(seq.tab, decreasing=TRUE) ## ------------------------------------------------------------------------ end.seq <- as(Views(usf2.seq, end=end(frags), width=3), "DNAStringSet") end.seq <- reverseComplement(end.seq) seq.tab <- table(end.seq) sort(seq.tab, decreasing=TRUE) ## ------------------------------------------------------------------------ frag.seq <- as(Views(usf2.seq, start=start(frags), end(frags)), "DNAStringSet") ## ------------------------------------------------------------------------ frag.gc <- letterFrequency(frag.seq, "GC", as.prob=TRUE) plot(density(frag.gc)) ## ------------------------------------------------------------------------ uniform.gc <- letterFrequencyInSlidingView(usf2.seq, median(width(frags)), "GC", as.prob=TRUE) plot(density(uniform.gc)) ## ------------------------------------------------------------------------ plot(density(uniform.gc), ylim=c(0,12), lwd=2, xlab="fragment GC content", main="") for (i in seq_along(bamfiles)) { frags <- readTxFragments(bamfiles[i], usf2[3]) frag.seq <- as(Views(usf2.seq, start=start(frags), end(frags)), "DNAStringSet") frag.gc <- letterFrequency(frag.seq, "GC", as.prob=TRUE) lines(density(frag.gc), col=samples$center[i], lwd=2) } ## ------------------------------------------------------------------------ sessionInfo()