library(IntermediateSequenceAnalysis2013)

library(org.Hs.eg.db)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(BSgenome.Hsapiens.UCSC.hg19)
library(seqnames.db)
library(gmapR)
library(VariantTools)
library(lattice)
library(MotifDb)
library(parallel)

## configuration: parallelism, input location and annotation resources

## register the detected core count as the default for mclapply()
options(mc.cores = detectCores())

## directory that is searched for BAM files further down
path <- "~/SequenceData/TERT/bam"

## short alias for the motif collection
mdb <- MotifDb

## give both annotation objects NCBI-style sequence names (chromosome 5 is
## addressed as "5" below); transcript database first, then the genome
seqnameStyle(TxDb.Hsapiens.UCSC.hg19.knownGene) <- "NCBI"
seqnameStyle(BSgenome.Hsapiens.UCSC.hg19) <- "NCBI"

## aliases are taken only after the renaming so they carry the new names
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
bsgenome <- BSgenome.Hsapiens.UCSC.hg19

## look up the Entrez gene id for the gene symbol TERT
geneid <- AnnotationDbi::select(org.Hs.eg.db, "TERT", "ENTREZID", "SYMBOL")

## create a mini-genome, chr5 only

## start from one range per sequence listed in the genome's seqinfo
roi <- as(seqinfo(BSgenome.Hsapiens.UCSC.hg19),
          "GRanges")
seqlevels(roi, force=TRUE) <- "5"       # drop unused
## pull the sequence for the remaining range (not coerced to character)
chr5seq <- getSeq(bsgenome, roi, as.character=FALSE)
names(chr5seq) <- "5" ## must match seqs in Bam
## build a gmapR genome called "hg19_5" from that single sequence
genome.5 <- GmapGenome(chr5seq, name="hg19_5", create=TRUE)

## call the variants

## `pattern` is a regular expression, not a shell glob: match a literal
## ".bam" suffix, and spell out `full.names` instead of relying on partial
## argument matching
fls <- dir(path, pattern="\\.bam$", full.names=TRUE)
if (length(fls) == 0L)
    stop("no BAM files found in ", path, call.=FALSE)

## overall span of the TERT transcripts, then the 330 bases upstream of it
rng <- range(transcriptsBy(txdb, "gene")[[geneid$ENTREZID]])
pregion <- promoters(rng, upstream=330, downstream=0)
seqlevels(pregion) <- "5"
vtparam <- VariantTallyParam(genome.5, readlen=101L, which=pregion,
                             indels=TRUE)

## one callVariants() run per BAM file, spread over the available cores
called <- mclapply(fls, callVariants, tally.param=vtparam)

## mclapply() returns "try-error" objects for jobs that failed instead of
## stopping; refuse to concatenate those with genuine results
failed <- vapply(called, inherits, logical(1), what="try-error")
if (any(failed))
    stop("callVariants() failed for: ",
         paste(basename(fls)[failed], collapse=", "), call.=FALSE)

## remember how many variants came from each file before flattening
len <- elementLengths(called)
called <- do.call(c, called)

## sample id = file name without the ".TERT.bam" suffix (dots escaped and
## the match anchored so only the real suffix is stripped)
id <- sub("\\.TERT\\.bam$", "", basename(fls))
called$id <- factor(rep(id, len), levels=unique(id))

## make a levelplot of the variants

## cross-tabulate cycleCount.10.91 / count.total by variant start position
## and sample id; xtabs() sums the ratio within each position/sample cell.
## NOTE(review): presumably this ratio is the fraction of reads supporting
## the alternate allele -- confirm against the VariantTools column docs.
alt.counts <-
    xtabs(cycleCount.10.91 / count.total ~ start(called) + id,
          mcols(called))

## grey-scale heat map; the grey ramp is reversed so that larger values are
## drawn darker, and the position labels are rotated by 45 degrees
levelplot(alt.counts, xlab="Position", ylab=NULL,
          scales=list(x=list(rot=45)), aspect="fill",
          col.regions=rev(gray.colors(100, 0, 1)))

# obtain reference and mutant sequences with which to query jaspar human pfms:
# ref.seq:       from the hg19 reference genome (the chr5 mini-genome)
# alt.consensus: the consensus string of the variant sequences derived from
#                the called variants

# window around position 1295228 on chromosome 5 (presumably the C228T site
# discussed further down -- confirm)
# NOTE(review): per the IRanges flank() documentation, both=TRUE puts `width`
# positions on either side of the range start, i.e. a 20-base window here,
# whereas the example sequences below are 21 bases long -- confirm which
# window is intended.
snp <- GRanges("5", IRanges(1295228, width=1))
snp <- flank(snp, 10, both=TRUE)

ref.seq <- DNAString(getSeq(genome.5, snp))

# disabled: the reverse complement is taken when the sequences are matched
## ref.seq <- reverseComplement(DNAString(seq.ref))

alt.seq <- variantSequences(called, snp, genome.5)
alt.consensus <- DNAString(consensusString(alt.seq))


# here's what they look like
# ref.seq:       GCCCAGCCCCCTCCGGGCCCT
# alt.consensus: GCCCAGCCCCTTCCGGGCCCT

## keep only the human matrices whose data source is JASPAR_CORE
idx <- (mcols(mdb)$organism == "Hsapiens") &
    (mcols(mdb)$dataSource == "JASPAR_CORE")
jaspar.human.pwms <- mdb[idx]

   # Huang et al. claim:
   #  Both C228T and C250T generated an identical 11-bp nucleotide stretch
   #  (5′-CCCCTTCCGGG-3′) containing a consensus binding site for
   #  E-twenty-six (ETS) transcription factors (GGAA, reverse
   #  complement) within the TERT promoter region. Because ETS transcription
   #  factors may become activated through dysregulation of
   #  mitogen-activated protein kinase (MAP kinase) signaling, we
   #  hypothesized that these promoter mutations might augment gene
   #  expression.
   #
   # C250T does not appear in the bam files I chose.

## choose a high minimum matching score: only one base has changed
minScore <- "90%"

## scan the reverse complement of the reference sequence against the human
## JASPAR matrices and show the hits
ref.hits <- matchPWMs(jaspar.human.pwms, reverseComplement(ref.seq),
                      minScore)
ref.hits

## same scan for the consensus of the variant sequences
alt.hits <- matchPWMs(jaspar.human.pwms, reverseComplement(alt.consensus),
                      minScore)
alt.hits
