Contents

The material in this course requires R version 3.3 and Bioconductor version 3.4

stopifnot(
    getRversion() >= '3.3' && getRversion() < '3.4',
    BiocInstaller::biocVersion() == "3.4"
)

1 Annotation

1.1 Model organisms

1.1.1 Gene model annotation resources – TxDb packages

e.g., `{r Biocpkg(“TxDb.Hsapiens.UCSC.hg19.knownGene”)}

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
## # GenomicFeatures version at creation time: 1.21.30
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
methods(class=class(txdb))
##  [1] $                      $<-                    ExpressionSet          annotatedDataFrameFrom
##  [5] as.list                asBED                  asGFF                  assayData             
##  [9] assayData<-            cds                    cdsBy                  cdsByOverlaps         
## [13] coerce                 columns                combine                contents              
## [17] dbInfo                 dbconn                 dbfile                 dbmeta                
## [21] dbschema               disjointExons          distance               exons                 
## [25] exonsBy                exonsByOverlaps        extractUpstreamSeqs    featureNames          
## [29] featureNames<-         fiveUTRsByTranscript   genes                  initialize            
## [33] intronsByTranscript    isActiveSeq            isActiveSeq<-          isNA                  
## [37] keys                   keytypes               locateVariants         mapIds                
## [41] mapIdsToRanges         mapRangesToIds         mapToTranscripts       mappedkeys            
## [45] metadata               microRNAs              nhit                   organism              
## [49] predictCoding          promoters              revmap                 sample                
## [53] sampleNames            sampleNames<-          saveDb                 select                
## [57] seqinfo                seqlevels0             seqlevels<-            show                  
## [61] species                storageMode            storageMode<-          summarizeVariants     
## [65] tRNAs                  taxonomyId             threeUTRsByTranscript  transcripts           
## [69] transcriptsBy          transcriptsByOverlaps  updateObject          
## see '?methods' for accessing help and source code

TxDb objects

Accessing gene models

  • exons(), transcripts(), genes(), cds() (coding sequence)
  • promoters() & friends
  • exonsBy() & friends – exons by gene, transcript, …
  • ‘select’ interface: keytypes(), columns(), keys(), select(), mapIds()
exons(txdb)
## GRanges object with 289969 ranges and 1 metadata column:
##                  seqnames         ranges strand |   exon_id
##                     <Rle>      <IRanges>  <Rle> | <integer>
##        [1]           chr1 [11874, 12227]      + |         1
##        [2]           chr1 [12595, 12721]      + |         2
##        [3]           chr1 [12613, 12721]      + |         3
##        [4]           chr1 [12646, 12697]      + |         4
##        [5]           chr1 [13221, 14409]      + |         5
##        ...            ...            ...    ... .       ...
##   [289965] chrUn_gl000241 [35706, 35859]      - |    289965
##   [289966] chrUn_gl000241 [36711, 36875]      - |    289966
##   [289967] chrUn_gl000243 [11501, 11530]      + |    289967
##   [289968] chrUn_gl000243 [13608, 13637]      + |    289968
##   [289969] chrUn_gl000247 [ 5787,  5816]      - |    289969
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
exonsBy(txdb, "tx")
## GRangesList object of length 82960:
## $1 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [11874, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12613, 12721]      + |         3        <NA>         2
##   [3]     chr1 [13221, 14409]      + |         5        <NA>         3
## 
## $2 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12595, 12721]      + |       2      <NA>         2
##   [3]     chr1 [13403, 14409]      + |       6      <NA>         3
## 
## $3 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12646, 12697]      + |       4      <NA>         2
##   [3]     chr1 [13221, 14409]      + |       5      <NA>         3
## 
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

1.1.2 Whole-genome sequence – BSgenome packages

e.g., `{r Biocpkg(“BSgenome.Hsapiens.UCSC.hg19”)}

library(BSgenome.Hsapiens.UCSC.hg19)
genome <- BSgenome.Hsapiens.UCSC.hg19
getSeq(genome, exons(txdb)[1:100])
##   A DNAStringSet instance of length 100
##       width seq
##   [1]   354 CTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGT...AAAGATGAGTGAGAGCATCAACTTCTCTCACAACCTAGGCCA
##   [2]   127 GCTCCTGTCTCCCCCCAGGTGTGTGGTGATGCCAGGCATGCCC...GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG
##   [3]   109 GTGTGTGGTGATGCCAGGCATGCCCTTCCCCAGCATCAGGTCT...GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG
##   [4]    52 CATCAGGTCTCCAGAGCTGCAGAAGACGACGGCCGACTTGGATCACACTCTT
##   [5]  1189 GCAGGGCCATCAGGCACCAAAGGGATTCTGCCAGCATAGTGCT...ACAGCTAGAGATCCTTTATTAAAAGCACACTGTTGGTTTCTG
##   ...   ... ...
##  [96]   251 CCCGTCCCCGGCGCGGCCCGCGCGCTCCTCCGCCGCCTCTCGC...ATCCTCAACGTGGACCCGGTGCAGCACACGTACTCCTGCAAG
##  [97]   262 GTTCGGGTCTGGCGGTACTTGAAGGGCAAAGACCTGGTGGCCC...TCACCCTGCGGAACCTGGAGGAGGTGGAGTTCTGTGTGGAAG
##  [98]    48 ATAAACCCGGGACCCACTTCACTCCAGTGCCTCCGACGCCTCCTGATG
##  [99]   216 CGTGCCGGGGAATGCTGTGCGGCTTCGGCGCCGTGTGCGAGCC...GCCAGCAGCGCCGCATCCGCCTGCTCAGCCGCGGGCCGTGCG
## [100]   225 GCTCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAG...CCCGCCAGGAGAATGTCTTCAAGAAGTTCGACGGCCCTTGTG

1.1.3 Identifier mapping – OrgDb

library(org.Hs.eg.db)
org.Hs.eg.db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: HUMAN_DB
## | ORGANISM: Homo sapiens
## | SPECIES: Human
## | EGSOURCEDATE: 2016-Mar14
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | CENTRALID: EG
## | TAXID: 9606
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 20160305
## | GOEGSOURCEDATE: 2016-Mar14
## | GOEGSOURCENAME: Entrez Gene
## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
## | KEGGSOURCEDATE: 2011-Mar15
## | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
## | GPSOURCEURL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19
## | GPSOURCEDATE: 2010-Mar22
## | ENSOURCEDATE: 2016-Mar9
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Wed Mar 23 15:50:09 2016
## 
## Please see: help('select') for usage information

OrgDb objects

  • Curated resources, underlying sqlite data base, like TxDb
  • make your own: AnnotationForge (but see the AnnotationHub, below!)
  • ‘select’ interface: keytypes(), columns(), keys(), select(), mapIds()

select()

  • Vector of keys, desired columns
  • Specification of key type

    select(org.Hs.eg.db, c("BRCA1", "PTEN"), c("ENTREZID", "GENENAME"), "SYMBOL")
    ## 'select()' returned 1:1 mapping between keys and columns
    ##   SYMBOL ENTREZID                       GENENAME
    ## 1  BRCA1      672                breast cancer 1
    ## 2   PTEN     5728 phosphatase and tensin homolog
    keytypes(org.Hs.eg.db)
    ##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"    
    ##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
    ## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"        
    ## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
    ## [25] "UNIGENE"      "UNIPROT"
    columns(org.Hs.eg.db)
    ##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"    
    ##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
    ## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"        
    ## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
    ## [25] "UNIGENE"      "UNIPROT"

Related functionality

  • mapIds() – special case for mapping from 1 identifier to another
  • OrganismDb objects: combined org.*, TxDb.*, and other annotation resources for easy access

    library(Homo.sapiens)
    ## Loading required package: OrganismDbi
    ## Loading required package: GO.db
    ## 
    select(Homo.sapiens, c("BRCA1", "PTEN"), 
           c("TXNAME", "TXCHROM", "TXSTART", "TXEND"), 
           "SYMBOL")
    ## 'select()' returned 1:many mapping between keys and columns
    ##    SYMBOL     TXNAME TXCHROM  TXSTART    TXEND
    ## 1   BRCA1 uc010whl.2   chr17 41196312 41276132
    ## 2   BRCA1 uc002icp.4   chr17 41196312 41277340
    ## 3   BRCA1 uc010whm.2   chr17 41196312 41277340
    ## 4   BRCA1 uc002icu.3   chr17 41196312 41277468
    ## 5   BRCA1 uc010cyx.3   chr17 41196312 41277468
    ## 6   BRCA1 uc002icq.3   chr17 41196312 41277500
    ## 7   BRCA1 uc002ict.3   chr17 41196312 41277500
    ## 8   BRCA1 uc010whn.2   chr17 41196312 41277500
    ## 9   BRCA1 uc010who.3   chr17 41196312 41277500
    ## 10  BRCA1 uc010whp.2   chr17 41196312 41322420
    ## 11  BRCA1 uc010whq.1   chr17 41215350 41256973
    ## 12  BRCA1 uc002idc.1   chr17 41215350 41277468
    ## 13  BRCA1 uc010whr.1   chr17 41215350 41277468
    ## 14  BRCA1 uc002idd.3   chr17 41243117 41276132
    ## 15  BRCA1 uc002ide.1   chr17 41243452 41256973
    ## 16  BRCA1 uc010cyy.1   chr17 41243452 41277340
    ## 17  BRCA1 uc010whs.1   chr17 41243452 41277468
    ## 18  BRCA1 uc010cyz.2   chr17 41243452 41277500
    ## 19  BRCA1 uc010cza.2   chr17 41243452 41277500
    ## 20  BRCA1 uc010wht.1   chr17 41243452 41277500
    ## 21   PTEN uc001kfb.3   chr10 89623195 89728532
    ## 22   PTEN uc021pvw.1   chr10 89623195 89728532

1.2 Other annotation resources – biomaRt, AnnotationHub

1.2.1 biomaRt & friends

http://biomart.org; Bioconductor package biomaRt

## NEEDS INTERNET ACCESS !!
library(biomaRt)
head(listMarts(), 3)                      ## list marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <-                                ## fully specified mart
    useMart("ensembl", dataset = "hsapiens_gene_ensembl")

head(listFilters(ensembl), 3)             ## filters
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## return values
myValues <- c("21", "22")
head(listAttributes(ensembl), 3)          ## attributes
myAttributes <- c("ensembl_gene_id","chromosome_name")

## assemble and query the mart
res <- getBM(attributes =  myAttributes, filters =  myFilter,
             values =  myValues, mart = ensembl)

Other internet resources

1.2.2 AnnotationHub

  • Bioconductor package AnnotationHub
  • Meant to ease use of ‘consortium’ and other genome-scale resources
  • Simplify discovery, retrieval, local management, and import to standard Bioconductor representations

Example: Ensembl ‘GTF’ files to R / Bioconductor GRanges and TxDb

library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("Ensembl", "80", "gtf"))
## ensgtf = display(hub)                   # visual choice
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)

Example: non-model organism OrgDb packages

library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "OrgDb")

Example: Map Roadmap epigenomic marks to hg38

  • Roadmap BED file as GRanges

    library(AnnotationHub)
    hub <- AnnotationHub()
    query(hub , c("EpigenomeRoadMap", "E126", "H3K4ME2"))
    E126 <- hub[["AH29817"]]
  • UCSC ‘liftOver’ file to map coordinates

    query(hub , c("hg19", "hg38", "chainfile"))
    chain <- hub[["AH14150"]]
  • lift over – possibly one-to-many mapping, so GRanges to GRangesList

    library(rtracklayer)
    E126hg38 <- liftOver(E126, chain)
    E126hg38

2 Annotating Variants

Example: read variants from a VCF file, and annotate with respect to a known gene model

## input variants
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## known gene model
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
    TxDb.Hsapiens.UCSC.hg19.knownGene,
    CodingVariants())
head(coding)
## GRanges object with 6 ranges and 9 metadata columns:
##     seqnames               ranges strand | LOCATION  LOCSTART    LOCEND   QUERYID        TXID
##        <Rle>            <IRanges>  <Rle> | <factor> <integer> <integer> <integer> <character>
##   1    chr22 [50301422, 50301422]      - |   coding       939       939        24       75253
##   2    chr22 [50301476, 50301476]      - |   coding       885       885        25       75253
##   3    chr22 [50301488, 50301488]      - |   coding       873       873        26       75253
##   4    chr22 [50301494, 50301494]      - |   coding       867       867        27       75253
##   5    chr22 [50301584, 50301584]      - |   coding       777       777        28       75253
##   6    chr22 [50302962, 50302962]      - |   coding       698       698        57       75253
##             CDSID      GENEID       PRECEDEID        FOLLOWID
##     <IntegerList> <character> <CharacterList> <CharacterList>
##   1        218562       79087                                
##   2        218562       79087                                
##   3        218562       79087                                
##   4        218562       79087                                
##   5        218562       79087                                
##   6        218563       79087                                
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

3 Resources

Acknowledgements

3.1 sessionInfo()

sessionInfo()
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.4 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8       
##  [4] LC_COLLATE=C               LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      stats4    parallel  stats     graphics  grDevices utils     datasets  methods  
## [10] base     
## 
## other attached packages:
##  [1] Homo.sapiens_1.3.1                      GO.db_3.3.0                            
##  [3] OrganismDbi_1.15.1                      AnnotationHub_2.5.4                    
##  [5] Gviz_1.17.4                             biomaRt_2.29.2                         
##  [7] org.Hs.eg.db_3.3.0                      BiocParallel_1.7.4                     
##  [9] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 GenomicFeatures_1.25.14                
## [11] AnnotationDbi_1.35.3                    VariantAnnotation_1.19.2               
## [13] RNAseqData.HNRNPC.bam.chr14_0.11.0      GenomicAlignments_1.9.4                
## [15] Rsamtools_1.25.0                        SummarizedExperiment_1.3.5             
## [17] Biobase_2.33.0                          BSgenome.Hsapiens.UCSC.hg19_1.4.0      
## [19] BSgenome_1.41.2                         rtracklayer_1.33.7                     
## [21] GenomicRanges_1.25.8                    GenomeInfoDb_1.9.1                     
## [23] Biostrings_2.41.4                       XVector_0.13.2                         
## [25] IRanges_2.7.11                          S4Vectors_0.11.7                       
## [27] BiocGenerics_0.19.1                     ggplot2_2.1.0                          
## [29] BiocStyle_2.1.10                       
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.2.0                    splines_3.3.0                 Formula_1.2-1                
##  [4] shiny_0.13.2                  interactiveDisplayBase_1.11.3 latticeExtra_0.6-28          
##  [7] RBGL_1.49.1                   yaml_2.1.13                   RSQLite_1.0.0                
## [10] lattice_0.20-33               biovizBase_1.21.0             chron_2.3-47                 
## [13] digest_0.6.9                  RColorBrewer_1.1-2            colorspace_1.2-6             
## [16] htmltools_0.3.5               httpuv_1.3.3                  Matrix_1.2-6                 
## [19] plyr_1.8.4                    XML_3.98-1.4                  zlibbioc_1.19.0              
## [22] xtable_1.8-2                  scales_0.4.0                  nnet_7.3-12                  
## [25] survival_2.39-4               magrittr_1.5                  mime_0.4                     
## [28] evaluate_0.9                  foreign_0.8-66                graph_1.51.0                 
## [31] BiocInstaller_1.23.4          tools_3.3.0                   data.table_1.9.6             
## [34] formatR_1.4                   matrixStats_0.50.2            stringr_1.0.0                
## [37] munsell_0.4.3                 cluster_2.0.4                 ensembldb_1.5.8              
## [40] RCurl_1.95-4.8                dichromat_2.0-0               bitops_1.0-6                 
## [43] labeling_0.3                  rmarkdown_0.9.6               gtable_0.2.0                 
## [46] codetools_0.2-14              DBI_0.4-1                     reshape2_1.4.1               
## [49] R6_2.1.2                      gridExtra_2.2.1               knitr_1.13                   
## [52] Hmisc_3.17-4                  stringi_1.1.1                 Rcpp_0.12.5                  
## [55] rpart_4.1-10                  acepack_1.3-3.3