1 ChipDb/OrgDb questions

What is the central key for each of these AnnoDb objects? Can you print out the first 10 keys for each?

ChipDb central key are the probe ids by default. The OrgDb org.Hs.eg.db is an entrez orgDb (eg in name) and therefore the default is Entrez Gene Id. To verify we print out the first 10 keys

head(keys(hugene20sttranscriptcluster.db), 10)

##  [1] "16650001" "16650003" "16650005" "16650007" "16650009" "16650011"
##  [7] "16650013" "16650015" "16650017" "16650019"

head(keys(org.Hs.eg.db), 10)

##  [1] "1"  "2"  "3"  "9"  "10" "11" "12" "13" "14" "15"

Can all of the available columns be used as keytypes for each of these?

keytypes(org.Hs.eg.db)

##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT" 
##  [5] "ENSEMBLTRANS" "ENTREZID"     "ENZYME"       "EVIDENCE"    
##  [9] "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"    
## [17] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"

columns(org.Hs.eg.db)

##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT" 
##  [5] "ENSEMBLTRANS" "ENTREZID"     "ENZYME"       "EVIDENCE"    
##  [9] "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"    
## [17] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"

setdiff(columns(hugene20sttranscriptcluster.db), keytypes(hugene20sttranscriptcluster.db))

## character(0)

# so in this case yes

What gene symbol corresponds to Entrez Gene ID 1000?

select(org.Hs.eg.db, "1000","SYMBOL")

## 'select()' returned 1:1 mapping between keys and columns

##   ENTREZID SYMBOL
## 1     1000   CDH2

select(hugene20sttranscriptcluster.db, "1000","SYMBOL", "ENTREZID")

## 'select()' returned 1:1 mapping between keys and columns

##   ENTREZID SYMBOL
## 1     1000   CDH2

What is the Ensembl Gene ID for PPARG?

select(org.Hs.eg.db, "PPARG", "ENSEMBL", "SYMBOL")

## 'select()' returned 1:1 mapping between keys and columns

##   SYMBOL         ENSEMBL
## 1  PPARG ENSG00000132170

What is the UniProt ID for GAPDH?

select(org.Hs.eg.db, "GAPDH", "UNIPROT","SYMBOL")

## 'select()' returned 1:many mapping between keys and columns

##   SYMBOL UNIPROT
## 1  GAPDH  P04406
## 2  GAPDH  V9HVZ4

How many of the probeset ids from the ChipDb map to a single gene? How many don’t map to a gene at all?

z <- mapIds(hugene20sttranscriptcluster.db, keys(hugene20sttranscriptcluster.db),
            "ENTREZID","PROBEID", multiVals = "list")

## 'select()' returned 1:many mapping between keys and columns

#or 

z <- mapIds(hugene20sttranscriptcluster.db, keys(hugene20sttranscriptcluster.db),
            "SYMBOL","PROBEID", multiVals = "list")

## 'select()' returned 1:many mapping between keys and columns

table(sapply(z, function(x) length(x[!is.na(x)])))

## 
##     0     1     2     3     4     5     6     7     8     9    10    11 
## 21909 29422  1514   350   135    78    51    19    20    29     4    11 
##    12    13    14    15    18    20    21    22    25    26    28    29 
##    13    14     1     1     3     1     5     6    14     2     2     3 
##    31    34    35 
##     2     1     7

Get all the gene alias for a given Entrez Gene ID displayed as a CharacterList. What are the alias for Entrez Gene ID 1000?

z <- mapIds(org.Hs.eg.db, keys(org.Hs.eg.db), "ALIAS", "ENTREZID", multiVals="CharacterList")

## 'select()' returned 1:many mapping between keys and columns

z[["1000"]]

## [1] "CD325"  "CDHN"   "CDw325" "NCAD"   "CDH2"

# or

select(org.Hs.eg.db, "1000", "ALIAS")

## 'select()' returned 1:many mapping between keys and columns

##   ENTREZID  ALIAS
## 1     1000  CD325
## 2     1000   CDHN
## 3     1000 CDw325
## 4     1000   NCAD
## 5     1000   CDH2

2 TxDb/EnsDb

How many Bioconductor TxDb packages are there?

# look on the website  or
length(BiocManager::available("TxDb"))

## [1] 34

length(BiocManager::available("EnsDb"))

## [1] 8

What information can be inferred from the name? What about for TxDb.Ggallus.UCSC.galGal4.refGene, TxDb.Rnorvegicus.BioMart.igis, EnsDb.Rnorvegicus.v79?

The name infers the species, source, build, and table used to generate the species.

Resource	Species	Source	Build	Table
TxDb.Hsapiens.UCSC.hg19.knownGene	Homo sapiens	UCSC	hg19 (GRCh37)	knownGene
TxDb.Ggallus.UCSC.galGal4.refGene	Gallus gallus	UCSC	galGal4	refGene
TxDb.Rnorvegicus.BioMart.igis	Rattus norvegicus	Biomart	igis
EnsDb.Rnorvegicus.v79	Rattus norvegicus	Ensembl	v79

What do you think is the central key for TxDb objects? What other keytypes can be used? Can all of the available columns be used as keytypes?

head(keys(TxDb.Hsapiens.UCSC.hg19.knownGene))

## [1] "1"         "10"        "100"       "1000"      "10000"     "100008586"

keytypes(TxDb.Hsapiens.UCSC.hg19.knownGene)

## [1] "CDSID"    "CDSNAME"  "EXONID"   "EXONNAME" "GENEID"   "TXID"    
## [7] "TXNAME"

# central key is TXID 

setdiff(columns(TxDb.Hsapiens.UCSC.hg19.knownGene), keytypes(TxDb.Hsapiens.UCSC.hg19.knownGene))

##  [1] "CDSCHROM"   "CDSEND"     "CDSSTART"   "CDSSTRAND"  "EXONCHROM" 
##  [6] "EXONEND"    "EXONRANK"   "EXONSTART"  "EXONSTRAND" "TXCHROM"   
## [11] "TXEND"      "TXSTART"    "TXSTRAND"   "TXTYPE"

# These are columns available that are NOT available as keytypes... 
# so No not all columns can be used as keys

List all the genomic positions of the transcripts by gene.

transcriptsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by="gene")

## GRangesList object of length 23459:
## $`1`
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames            ranges strand |     tx_id     tx_name
##          <Rle>         <IRanges>  <Rle> | <integer> <character>
##   [1]    chr19 58858172-58864865      - |     70455  uc002qsd.4
##   [2]    chr19 58859832-58874214      - |     70456  uc002qsf.2
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
## 
## $`10`
## GRanges object with 1 range and 2 metadata columns:
##       seqnames            ranges strand |     tx_id     tx_name
##          <Rle>         <IRanges>  <Rle> | <integer> <character>
##   [1]     chr8 18248755-18258723      + |     31944  uc003wyw.1
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
## 
## $`100`
## GRanges object with 1 range and 2 metadata columns:
##       seqnames            ranges strand |     tx_id     tx_name
##          <Rle>         <IRanges>  <Rle> | <integer> <character>
##   [1]    chr20 43248163-43280376      - |     72132  uc002xmj.3
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
## 
## ...
## <23456 more elements>

List all the promoter regions 1000bp upstream and 200bp download

promoters(TxDb.Hsapiens.UCSC.hg19.knownGene, upstream=1000, downstream=200,  use.names=TRUE)

## GRanges object with 82960 ranges and 2 metadata columns:
##                    seqnames        ranges strand |     tx_id     tx_name
##                       <Rle>     <IRanges>  <Rle> | <integer> <character>
##   uc001aaa.3           chr1   10874-12073      + |         1  uc001aaa.3
##   uc010nxq.1           chr1   10874-12073      + |         2  uc010nxq.1
##   uc010nxr.1           chr1   10874-12073      + |         3  uc010nxr.1
##   uc001aal.1           chr1   68091-69290      + |         4  uc001aal.1
##   uc001aaq.2           chr1 320084-321283      + |         5  uc001aaq.2
##          ...            ...           ...    ... .       ...         ...
##   uc011mgu.1 chrUn_gl000237     2487-3686      - |     82956  uc011mgu.1
##   uc011mgv.2 chrUn_gl000241   36676-37875      - |     82957  uc011mgv.2
##   uc011mgw.1 chrUn_gl000243   10501-11700      + |     82958  uc011mgw.1
##   uc022brq.1 chrUn_gl000243   12608-13807      + |     82959  uc022brq.1
##   uc022brr.1 chrUn_gl000247     5617-6816      - |     82960  uc022brr.1
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

How many transcripts does PPARG have, according to UCSC? And does Ensembl agree? Is it right to compare these results?

select(org.Hs.eg.db, "PPARG", "ENTREZID", "SYMBOL")

## 'select()' returned 1:1 mapping between keys and columns

##   SYMBOL ENTREZID
## 1  PPARG     5468

# so entrez id is 5468

transcriptsBy(TxDb.Hsapiens.UCSC.hg19.knownGene)[["5468"]]

## GRanges object with 10 ranges and 2 metadata columns:
##        seqnames            ranges strand |     tx_id     tx_name
##           <Rle>         <IRanges>  <Rle> | <integer> <character>
##    [1]     chr3 12328984-12475855      + |     13253  uc031rym.1
##    [2]     chr3 12329349-12475855      + |     13254  uc003bwr.3
##    [3]     chr3 12329349-12475855      + |     13255  uc003bws.3
##    [4]     chr3 12330436-12421430      + |     13256  uc010hdz.1
##    [5]     chr3 12330436-12448345      + |     13257  uc003bwt.1
##    [6]     chr3 12330436-12475855      + |     13258  uc003bwu.3
##    [7]     chr3 12353879-12475855      + |     13259  uc003bwv.3
##    [8]     chr3 12393001-12448345      + |     13260  uc003bww.1
##    [9]     chr3 12393001-12475855      + |     13261  uc003bwx.3
##   [10]     chr3 12421203-12458653      + |     13262  uc010hea.1
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

select(org.Hs.eg.db, "PPARG", "ENSEMBL", "SYMBOL")

## 'select()' returned 1:1 mapping between keys and columns

##   SYMBOL         ENSEMBL
## 1  PPARG ENSG00000132170

# so ensembl id is ENSG00000132170

transcriptsBy(EnsDb.Hsapiens.v86)[["ENSG00000132170"]]

## GRanges object with 14 ranges and 6 metadata columns:
##        seqnames            ranges strand |           tx_id
##           <Rle>         <IRanges>  <Rle> |     <character>
##    [1]        3 12287368-12434344      + | ENST00000397010
##    [2]        3 12287388-12381491      + | ENST00000397029
##    [3]        3 12287860-12434344      + | ENST00000309576
##    [4]        3 12287901-12434311      + | ENST00000397015
##    [5]        3 12288838-12379834      + | ENST00000455517
##    ...      ...               ...    ... .             ...
##   [10]        3 12312388-12434216      + | ENST00000397000
##   [11]        3 12351472-12406846      + | ENST00000477039
##   [12]        3 12351472-12434356      + | ENST00000287820
##   [13]        3 12351569-12434344      + | ENST00000397023
##   [14]        3 12379706-12417154      + | ENST00000396999
##                     tx_biotype tx_cds_seq_start tx_cds_seq_end
##                    <character>        <integer>      <integer>
##    [1]          protein_coding         12379706       12434145
##    [2]          protein_coding         12379706       12381491
##    [3]          protein_coding         12379706       12434145
##    [4]          protein_coding         12379706       12434145
##    [5]          protein_coding         12379706       12379834
##    ...                     ...              ...            ...
##   [10]          protein_coding         12379706       12433915
##   [11]         retained_intron             <NA>           <NA>
##   [12]          protein_coding         12351593       12434145
##   [13] nonsense_mediated_decay         12351593       12371891
##   [14] nonsense_mediated_decay         12379706       12399373
##                gene_id         tx_name
##            <character>     <character>
##    [1] ENSG00000132170 ENST00000397010
##    [2] ENSG00000132170 ENST00000397029
##    [3] ENSG00000132170 ENST00000309576
##    [4] ENSG00000132170 ENST00000397015
##    [5] ENSG00000132170 ENST00000455517
##    ...             ...             ...
##   [10] ENSG00000132170 ENST00000397000
##   [11] ENSG00000132170 ENST00000477039
##   [12] ENSG00000132170 ENST00000287820
##   [13] ENSG00000132170 ENST00000397023
##   [14] ENSG00000132170 ENST00000396999
##   -------
##   seqinfo: 357 sequences from GRCh38 genome

# No they don't agree 10 from entrez and 14 from ensembl
# No the builds are different 

library(TxDb.Hsapiens.UCSC.hg38.knownGene)
transcriptsBy(TxDb.Hsapiens.UCSC.hg38.knownGene)[["5468"]]

## GRanges object with 17 ranges and 2 metadata columns:
##        seqnames            ranges strand |     tx_id            tx_name
##           <Rle>         <IRanges>  <Rle> | <integer>        <character>
##    [1]     chr3 12287368-12434344      + |     33852  ENST00000397010.6
##    [2]     chr3 12287388-12381491      + |     33853  ENST00000397029.7
##    [3]     chr3 12287860-12434344      + |     33854 ENST00000309576.10
##    [4]     chr3 12287901-12434311      + |     33855  ENST00000397015.6
##    [5]     chr3 12287943-12434162      + |     33856  ENST00000643888.1
##    ...      ...               ...    ... .       ...                ...
##   [13]     chr3 12312388-12434216      + |     33864  ENST00000397000.5
##   [14]     chr3 12351472-12406846      + |     33866  ENST00000477039.5
##   [15]     chr3 12351472-12434356      + |     33867 ENST00000287820.10
##   [16]     chr3 12351569-12434344      + |     33868  ENST00000397023.5
##   [17]     chr3 12379706-12417154      + |     33869  ENST00000396999.2
##   -------
##   seqinfo: 595 sequences (1 circular) from hg38 genome

# The still don't agree 17 to 14
# There is a difference. Which is better? for you to decide

How many genes are between 2858473 and 3271812 on chr2 in the hg19 genome?
- Hint: you make a GRanges object

gns <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene) 
gns[gns %over%  GRanges("chr2:2858473-3271812")]

## GRanges object with 1 range and 1 metadata column:
##        seqnames          ranges strand |     gene_id
##           <Rle>       <IRanges>  <Rle> | <character>
##   7260     chr2 3192741-3381653      - |        7260
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

## OR

subsetByOverlaps(gns, GRanges("chr2:2858473-3271812"))

## GRanges object with 1 range and 1 metadata column:
##        seqnames          ranges strand |     gene_id
##           <Rle>       <IRanges>  <Rle> | <character>
##   7260     chr2 3192741-3381653      - |        7260
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

3 Organism.Db

Get all the GO terms for BRCA1

select(Homo.sapiens, "BRCA1", "GO", "SYMBOL")

## 'select()' returned 1:many mapping between keys and columns

##    SYMBOL         GO EVIDENCE ONTOLOGY
## 1   BRCA1 GO:0000151      NAS       CC
## 2   BRCA1 GO:0000724      IDA       BP
## 3   BRCA1 GO:0000729      TAS       BP
## 4   BRCA1 GO:0000800      IDA       CC
## 5   BRCA1 GO:0003677      TAS       MF
## 6   BRCA1 GO:0003684      IEA       MF
## 7   BRCA1 GO:0003713      IMP       MF
## 8   BRCA1 GO:0003713      NAS       MF
## 9   BRCA1 GO:0003723      IDA       MF
## 10  BRCA1 GO:0004842      IDA       MF
## 11  BRCA1 GO:0005515      IPI       MF
## 12  BRCA1 GO:0005634      IDA       CC
## 13  BRCA1 GO:0005654      TAS       CC
## 14  BRCA1 GO:0005694      ISS       CC
## 15  BRCA1 GO:0005737      IDA       CC
## 16  BRCA1 GO:0005886      IDA       CC
## 17  BRCA1 GO:0006260      TAS       BP
## 18  BRCA1 GO:0006301      IDA       BP
## 19  BRCA1 GO:0006302      IDA       BP
## 20  BRCA1 GO:0006302      IMP       BP
## 21  BRCA1 GO:0006303      TAS       BP
## 22  BRCA1 GO:0006349      IEA       BP
## 23  BRCA1 GO:0006357      IMP       BP
## 24  BRCA1 GO:0006359      TAS       BP
## 25  BRCA1 GO:0006633      IEA       BP
## 26  BRCA1 GO:0006915      TAS       BP
## 27  BRCA1 GO:0006974      TAS       BP
## 28  BRCA1 GO:0006978      TAS       BP
## 29  BRCA1 GO:0007059      IMP       BP
## 30  BRCA1 GO:0007098      IEA       BP
## 31  BRCA1 GO:0008270      IEA       MF
## 32  BRCA1 GO:0008274      NAS       CC
## 33  BRCA1 GO:0008630      IDA       BP
## 34  BRCA1 GO:0010212      IMP       BP
## 35  BRCA1 GO:0010575      IMP       BP
## 36  BRCA1 GO:0010628      IMP       BP
## 37  BRCA1 GO:0015631      NAS       MF
## 38  BRCA1 GO:0016567      IDA       BP
## 39  BRCA1 GO:0016579      TAS       BP
## 40  BRCA1 GO:0019899      IPI       MF
## 41  BRCA1 GO:0030521      NAS       BP
## 42  BRCA1 GO:0031398      IDA       BP
## 43  BRCA1 GO:0031436      IDA       CC
## 44  BRCA1 GO:0031625      IPI       MF
## 45  BRCA1 GO:0032991      IDA       CC
## 46  BRCA1 GO:0033147      IMP       BP
## 47  BRCA1 GO:0035066      IDA       BP
## 48  BRCA1 GO:0042127      TAS       BP
## 49  BRCA1 GO:0042802      IPI       MF
## 50  BRCA1 GO:0042981      TAS       BP
## 51  BRCA1 GO:0043627      IDA       BP
## 52  BRCA1 GO:0044030      IEA       BP
## 53  BRCA1 GO:0044212      IDA       MF
## 54  BRCA1 GO:0044818      IEA       BP
## 55  BRCA1 GO:0045717      IMP       BP
## 56  BRCA1 GO:0045739      IMP       BP
## 57  BRCA1 GO:0045766      IMP       BP
## 58  BRCA1 GO:0045892      IDA       BP
## 59  BRCA1 GO:0045893      IDA       BP
## 60  BRCA1 GO:0045893      NAS       BP
## 61  BRCA1 GO:0045893      TAS       BP
## 62  BRCA1 GO:0045944      IDA       BP
## 63  BRCA1 GO:0045944      IMP       BP
## 64  BRCA1 GO:0046600      NAS       BP
## 65  BRCA1 GO:0050681      NAS       MF
## 66  BRCA1 GO:0051571      IDA       BP
## 67  BRCA1 GO:0051572      IEA       BP
## 68  BRCA1 GO:0051573      IDA       BP
## 69  BRCA1 GO:0051574      IEA       BP
## 70  BRCA1 GO:0051865      IDA       BP
## 71  BRCA1 GO:0070063      IDA       MF
## 72  BRCA1 GO:0070317      TAS       BP
## 73  BRCA1 GO:0070512      IDA       BP
## 74  BRCA1 GO:0070531      IDA       CC
## 75  BRCA1 GO:0071158      IDA       BP
## 76  BRCA1 GO:0071356      IMP       BP
## 77  BRCA1 GO:0071681      IDA       BP
## 78  BRCA1 GO:0072425      IMP       BP
## 79  BRCA1 GO:0085020      IDA       BP
## 80  BRCA1 GO:1901796      TAS       BP
## 81  BRCA1 GO:1902042      IMP       BP
## 82  BRCA1 GO:1990904      IDA       CC
## 83  BRCA1 GO:2000378      IMP       BP
## 84  BRCA1 GO:2000617      IDA       BP
## 85  BRCA1 GO:2000620      IDA       BP

What gene does the UCSC transcript ID uc002fai.3 map to?

select(Homo.sapiens, "uc002fai.3", "SYMBOL", "TXNAME")

## 'select()' returned 1:1 mapping between keys and columns

##       TXNAME SYMBOL
## 1 uc002fai.3  ZNF23

How many other transcripts does that gene have?

transcriptsBy(Homo.sapiens)[[mapIds(Homo.sapiens, "uc002fai.3", "ENTREZID","TXNAME")]]

## 'select()' returned 1:1 mapping between keys and columns

## GRanges object with 6 ranges and 2 metadata columns:
##       seqnames            ranges strand |     tx_id     tx_name
##          <Rle>         <IRanges>  <Rle> | <integer> <character>
##   [1]    chr16 71481503-71496117      - |     59822  uc002faf.3
##   [2]    chr16 71481503-71496273      - |     59823  uc002fah.3
##   [3]    chr16 71481506-71487876      - |     59824  uc002fad.3
##   [4]    chr16 71481506-71496273      - |     59825  uc002fag.3
##   [5]    chr16 71481506-71496273      - |     59826  uc010vmf.2
##   [6]    chr16 71481506-71523217      - |     59827  uc002fai.3
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

Get all the transcripts from the hg19 genome build, along with their Ensembl gene ID, UCSC transcript ID and gene symbol

transcriptsBy(Homo.sapiens, columns = c("ENSEMBL", "SYMBOL"))

## 'select()' returned 1:many mapping between keys and columns

## GRangesList object of length 23459:
## $`1`
## GRanges object with 2 ranges and 3 metadata columns:
##       seqnames            ranges strand |     tx_name         ENSEMBL
##          <Rle>         <IRanges>  <Rle> | <character> <CharacterList>
##   [1]    chr19 58858172-58864865      - |  uc002qsd.4 ENSG00000121410
##   [2]    chr19 58859832-58874214      - |  uc002qsf.2 ENSG00000121410
##                SYMBOL
##       <CharacterList>
##   [1]            A1BG
##   [2]            A1BG
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
## 
## $`10`
## GRanges object with 1 range and 3 metadata columns:
##       seqnames            ranges strand |     tx_name         ENSEMBL
##          <Rle>         <IRanges>  <Rle> | <character> <CharacterList>
##   [1]     chr8 18248755-18258723      + |  uc003wyw.1 ENSG00000156006
##                SYMBOL
##       <CharacterList>
##   [1]            NAT2
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
## 
## $`100`
## GRanges object with 1 range and 3 metadata columns:
##       seqnames            ranges strand |     tx_name         ENSEMBL
##          <Rle>         <IRanges>  <Rle> | <character> <CharacterList>
##   [1]    chr20 43248163-43280376      - |  uc002xmj.3 ENSG00000196839
##                SYMBOL
##       <CharacterList>
##   [1]             ADA
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
## 
## ...
## <23456 more elements>

4 Organism.dplyr

How many supported organisms are implemented in Organism.dplyr?

library(Organism.dplyr)
supportedOrganisms()

## # A tibble: 21 x 3
##    organism                OrgDb        TxDb                                 
##    <chr>                   <chr>        <chr>                                
##  1 Bos taurus              org.Bt.eg.db TxDb.Btaurus.UCSC.bosTau8.refGene    
##  2 Caenorhabditis elegans  org.Ce.eg.db TxDb.Celegans.UCSC.ce11.refGene      
##  3 Caenorhabditis elegans  org.Ce.eg.db TxDb.Celegans.UCSC.ce6.ensGene       
##  4 Canis familiaris        org.Cf.eg.db TxDb.Cfamiliaris.UCSC.canFam3.refGene
##  5 Drosophila melanogaster org.Dm.eg.db TxDb.Dmelanogaster.UCSC.dm3.ensGene  
##  6 Drosophila melanogaster org.Dm.eg.db TxDb.Dmelanogaster.UCSC.dm6.ensGene  
##  7 Danio rerio             org.Dr.eg.db TxDb.Drerio.UCSC.danRer10.refGene    
##  8 Gallus gallus           org.Gg.eg.db TxDb.Ggallus.UCSC.galGal4.refGene    
##  9 Homo sapiens            org.Hs.eg.db TxDb.Hsapiens.UCSC.hg18.knownGene    
## 10 Homo sapiens            org.Hs.eg.db TxDb.Hsapiens.UCSC.hg19.knownGene    
## # … with 11 more rows

dim(supportedOrganisms())[1]

## [1] 21

Display the ensembl Id and genename description for symbol “NAT2”.

src <- src_organism(dbpath = hg38light())
select(src, "NAT2", c("ensembl", "genename"),"symbol")

##   symbol         ensembl              genename
## 1   NAT2 ENSG00000156006 N-acetyltransferase 2

Show all the alias for “NAT2” in the database.

select(src, "NAT2", c("alias"),"symbol")

##   symbol alias
## 1   NAT2  AAC2
## 2   NAT2 NAT-2
## 3   NAT2  PNAT
## 4   NAT2  NAT2

Get all the promoter regions.

promoters(src)

## GRanges object with 77 ranges and 2 metadata columns:
##                                seqnames              ranges strand |
##                                   <Rle>           <IRanges>  <Rle> |
##   ENST00000336199.9                chr1 243843037-243845236      - |
##   ENST00000366540.5                chr1 243843385-243845584      - |
##   ENST00000263826.9                chr1 243843083-243845282      - |
##   ENST00000366539.5                chr1 243849929-243852128      - |
##   ENST00000492957.1                chr1 243614947-243617146      - |
##                 ...                 ...                 ...    ... .
##   ENST00000619536.4 chr1_KI270763v1_alt       855483-857682      - |
##   ENST00000621586.3 chr1_KI270763v1_alt       855069-857268      - |
##   ENST00000629115.1 chr1_KI270763v1_alt       867343-869542      - |
##   ENST00000629924.2 chr1_KI270763v1_alt       826591-828790      - |
##   ENST00000627932.1 chr1_KI270763v1_alt       830085-832284      - |
##                         tx_id           tx_name
##                     <integer>       <character>
##   ENST00000336199.9     18165 ENST00000336199.9
##   ENST00000366540.5     18166 ENST00000366540.5
##   ENST00000263826.9     18167 ENST00000263826.9
##   ENST00000366539.5     18168 ENST00000366539.5
##   ENST00000492957.1     18169 ENST00000492957.1
##                 ...       ...               ...
##   ENST00000619536.4    206918 ENST00000619536.4
##   ENST00000621586.3    206919 ENST00000621586.3
##   ENST00000629115.1    206920 ENST00000629115.1
##   ENST00000629924.2    206921 ENST00000629924.2
##   ENST00000627932.1    206922 ENST00000627932.1
##   -------
##   seqinfo: 595 sequences (1 circular) from hg38 genome

Extract the “id” table.

tbl(src, "id")

## # Source:   table<id> [?? x 6]
## # Database: sqlite 3.22.0 []
##    entrez map      ensembl         symbol genename               alias   
##    <chr>  <chr>    <chr>           <chr>  <chr>                  <chr>   
##  1 1      19q13.43 ENSG00000121410 A1BG   alpha-1-B glycoprotein A1B     
##  2 1      19q13.43 ENSG00000121410 A1BG   alpha-1-B glycoprotein ABG     
##  3 1      19q13.43 ENSG00000121410 A1BG   alpha-1-B glycoprotein GAB     
##  4 1      19q13.43 ENSG00000121410 A1BG   alpha-1-B glycoprotein HYST2477
##  5 1      19q13.43 ENSG00000121410 A1BG   alpha-1-B glycoprotein A1BG    
##  6 10     8p22     ENSG00000156006 NAT2   N-acetyltransferase 2  AAC2    
##  7 10     8p22     ENSG00000156006 NAT2   N-acetyltransferase 2  NAT-2   
##  8 10     8p22     ENSG00000156006 NAT2   N-acetyltransferase 2  PNAT    
##  9 10     8p22     ENSG00000156006 NAT2   N-acetyltransferase 2  NAT2    
## 10 100    20q13.12 ENSG00000196839 ADA    adenosine deaminase    ADA     
## # … with more rows

Display Gene ontology (GO) information for gene symbol “Nat2”.

src_tbls(src)

##  [1] "id_accession"  "id_transcript" "id"            "id_omim_pm"   
##  [5] "id_protein"    "id_go"         "id_go_all"     "ranges_gene"  
##  [9] "ranges_tx"     "ranges_exon"   "ranges_cds"

colnames(tbl(src, "id_go"))

## [1] "entrez"   "go"       "evidence" "ontology"

inner_join(tbl(src, "id"), tbl(src, "id_go")) %>%
            filter(symbol == "NAT2") %>%
            dplyr::select(entrez, ensembl, symbol, go, evidence, ontology)

## Joining, by = "entrez"

## # Source:   lazy query [?? x 6]
## # Database: sqlite 3.22.0 []
##    entrez ensembl         symbol go         evidence ontology
##    <chr>  <chr>           <chr>  <chr>      <chr>    <chr>   
##  1 10     ENSG00000156006 NAT2   GO:0004060 TAS      MF      
##  2 10     ENSG00000156006 NAT2   GO:0005515 IPI      MF      
##  3 10     ENSG00000156006 NAT2   GO:0005829 TAS      CC      
##  4 10     ENSG00000156006 NAT2   GO:0006805 TAS      BP      
##  5 10     ENSG00000156006 NAT2   GO:0004060 TAS      MF      
##  6 10     ENSG00000156006 NAT2   GO:0005515 IPI      MF      
##  7 10     ENSG00000156006 NAT2   GO:0005829 TAS      CC      
##  8 10     ENSG00000156006 NAT2   GO:0006805 TAS      BP      
##  9 10     ENSG00000156006 NAT2   GO:0004060 TAS      MF      
## 10 10     ENSG00000156006 NAT2   GO:0005515 IPI      MF      
## # … with more rows

5 BSgenome

How many BSgenomes are there in Bioconductor?

length(BiocManager::available("BSgenome"))

## [1] 94

Get the sequence from UCSC hg19 builds for chromosome 1. And print the frequency of each letter.

library(BSgenome.Hsapiens.UCSC.hg19)
seq = getSeq(BSgenome.Hsapiens.UCSC.hg19, "chr1")
alphabetFrequency(seq)

##        A        C        G        T        M        R        W        S 
## 65570891 47024412 47016562 65668756        0        0        0        0 
##        Y        K        V        H        D        B        N        - 
##        0        0        0        0        0        0 23970000        0 
##        +        . 
##        0        0

# only bases
alphabetFrequency(seq, baseOnly=TRUE)

##        A        C        G        T    other 
## 65570891 47024412 47016562 65668756 23970000

Get the sequence corresponding to chr6 35310335-35395968. Get the complement and reverse complement of it.

seq = getSeq(BSgenome.Hsapiens.UCSC.hg19, GRanges("chr6:35310335-35395968"))
complement(seq)

##   A DNAStringSet instance of length 1
##     width seq
## [1] 85634 CGCCTCGCACACTGCGACGCCGGCGGCGCCTG...TACCCTTAATTTATAAATTCTCGACTGACCTT

reverseComplement(seq)

##   A DNAStringSet instance of length 1
##     width seq
## [1] 85634 TTCCAGTCAGCTCTTAAATATTTAATTCCCAT...GTCCGCGGCGGCCGCAGCGTCACACGCTCCGC

Get the sequences for all transcripts of the TP53 gene

tp53 <- transcriptsBy(Homo.sapiens)[[mapIds(Homo.sapiens, "TP53", "ENTREZID","SYMBOL")]]

## 'select()' returned 1:1 mapping between keys and columns

getSeq(Hsapiens, tp53)

##   A DNAStringSet instance of length 17
##      width seq
##  [1] 14841 CCAGACTGCCTTCCGGGTCACTGCCATGGAGG...CAGCCTGTGCAACAGAGTGAGACTCTGTCTC
##  [2] 10534 CCAGACTGCCTTCCGGGTCACTGCCATGGAGG...TAGCCTGGGCAACAGAGCAAGACTCCATCTC
##  [3]  5207 CACTGCCCAACAACACCAGCTCCTCTCCCCAG...ACTTTGCTGCCACCTGTGTGTCTGAGGGGTG
##  [4]  7092 TGAGGCCAGGAGATGGAGGCTGCAGTGAGCTG...ACTTTGCTGCCACCTGTGTGTCTGAGGGGTG
##  [5]  7092 TGAGGCCAGGAGATGGAGGCTGCAGTGAGCTG...ACTTTGCTGCCACCTGTGTGTCTGAGGGGTG
##  ...   ... ...
## [13]  1056 TACTCCCCTGCCCTCAACAAGATGTTTTGCCA...TCCTCACCATCATCACACTGGAAGACTCCAG
## [14] 13370 GATGGGATTGGGGTTTTCCCCTCCCATGTGCT...TCCTCACCATCATCACACTGGAAGACTCCAG
## [15] 13370 GATGGGATTGGGGTTTTCCCCTCCCATGTGCT...TCCTCACCATCATCACACTGGAAGACTCCAG
## [16] 13018 GATGGGATTGGGGTTTTCCCCTCCCATGTGCT...GGTGAAACCCCGTCTCTACTGAAAAATACAA
## [17] 11557 GATGGGATTGGGGTTTTCCCCTCCCATGTGCT...TTCTGGGACAGCCAAGTCTGTGACTTGCACG

6 AnnotationHub

How many resources are available in AnnotationHub?

hub <- AnnotationHub()

## snapshotDate(): 2019-07-10

length(hub)

## [1] 46429

How many resources are on AnnotationHub for Atlantic salmon (Salmo salar)?

length(query(hub, "salmo salar"))

## [1] 1

Get the most recent Ensembl build for domesticated dog (Canis familiaris) and make a TxDb

dog <- query(hub, c("canis familiaris","ensembl","gtf"))
doggy <- dog[["AH69344"]]

## downloading 0 resources

## loading from cache

## Importing File into R ..

TxDoggy <- makeTxDbFromGRanges(doggy)

## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of
##   type stop_codon. This information was ignored.

Get information on the following ids “AH73986”, “AH73881”,“AH64538”, “AH69344”.

getInfoOnIds(hub, c("AH73986", "AH73881","AH64538", "AH69344"))

##          ah_id fetch_id                             title rdataclass status
## 291824 AH73986    80732 Ensembl 79 EnsDb for Homo sapiens      EnsDb Public
## 291719 AH73881    80627 Ensembl 97 EnsDb for Homo sapiens      EnsDb Public
## 282384 AH64538    71284 Canis_familiaris.CanFam3.1.94.gtf    GRanges Public
## 287182 AH69344    76090 Canis_familiaris.CanFam3.1.96.gtf    GRanges Public
##        biocversion rdatadateadded rdatadateremoved file_size
## 291824         3.9     2019-05-02             <NA> 334757888
## 291719         3.9     2019-05-02             <NA> 380694528
## 282384         3.8     2018-10-04             <NA>      <NA>
## 287182         3.8     2019-04-22             <NA>      <NA>

7 biomaRt

List available marts

listMarts()

##                biomart               version
## 1 ENSEMBL_MART_ENSEMBL      Ensembl Genes 97
## 2   ENSEMBL_MART_MOUSE      Mouse strains 97
## 3     ENSEMBL_MART_SNP  Ensembl Variation 97
## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 97

Use mart corresponding to Ensembl Genes 97 and list available datasets.

mart <- useMart("ENSEMBL_MART_ENSEMBL")
head(listDatasets(mart))

##                        dataset                           description
## 1 abrachyrhynchus_gene_ensembl Pink-footed goose genes (ASM259213v1)
## 2     acalliptera_gene_ensembl      Eastern happy genes (fAstCal1.2)
## 3   acarolinensis_gene_ensembl        Anole lizard genes (AnoCar2.0)
## 4    acitrinellus_gene_ensembl        Midas cichlid genes (Midas_v5)
## 5        ahaastii_gene_ensembl    Great spotted kiwi genes (aptHaa1)
## 6    amelanoleuca_gene_ensembl                 Panda genes (ailMel1)
##       version
## 1 ASM259213v1
## 2  fAstCal1.2
## 3   AnoCar2.0
## 4    Midas_v5
## 5     aptHaa1
## 6     ailMel1

Create mart with Homo sapiens ensembl genes dataset

mart <- useMart("ENSEMBL_MART_ENSEMBL","hsapiens_gene_ensembl")

Get the Ensembl gene IDs and HUGO symbol for Entrez Gene IDs 672, 5468 and 7157

getBM(c("ensembl_gene_id","hgnc_symbol","entrezgene_id"), "entrezgene_id", c("672","5468","7157"), mart)

## Cache found

##   ensembl_gene_id hgnc_symbol entrezgene_id
## 1 ENSG00000132170       PPARG          5468
## 2 ENSG00000012048       BRCA1           672
## 3 ENSG00000141510        TP53          7157

What do you get if you query for the ‘gene_exon’ for GAPDH?

res <- getBM(c("hgnc_symbol","gene_exon"),"hgnc_symbol","GAPDH",mart)

## Cache found

What are the avaiable search terms?

nms <- listAttributes(mart)$name

8 BiocFileCache

Create a temporary cache using tempdir(). We use a temporary directory so it will be cleaned up at the end. If using to manage file would want a more permanent location.

# the ask is not needed here
# because this is a vignette and not an interactive session when knitting, 
# this gives permission to create the cache on the system. 
bfc <- BiocFileCache(tempdir(), ask=FALSE)

What is the path to your cache?

bfccache(bfc)

## [1] "/tmp/RtmpYoNrM1"

bfc

## class: BiocFileCache
## bfccache: /tmp/RtmpYoNrM1
## bfccount: 0
## For more information see: bfcinfo() or bfcquery()

What columns do we store in the database by default?

names(bfcinfo(bfc))

##  [1] "rid"                "rname"              "create_time"       
##  [4] "access_time"        "rpath"              "rtype"             
##  [7] "fpath"              "last_modified_time" "etag"              
## [10] "expires"

Get a file path to save an object so that it is tracked in the cache. Assume the object will be saved as a RData object.

savepath <- bfcnew(bfc, "NewResource", ext=".RData")
savepath

##                                              BFC1 
## "/tmp/RtmpYoNrM1/4dd313a59cd5_4dd313a59cd5.RData"

# we can see the info has been added
bfcinfo(bfc)

## # A tibble: 1 x 10
##   rid   rname create_time access_time rpath rtype fpath last_modified_t…
##   <chr> <chr> <chr>       <chr>       <chr> <chr> <chr>            <dbl>
## 1 BFC1  NewR… 2019-07-19… 2019-07-19… /tmp… rela… 4dd3…               NA
## # … with 2 more variables: etag <chr>, expires <dbl>

# use this path to save an object
m = matrix(1:12, nrow=3)
save(m, file=savepath)

Add a remote resource to the cache. The httpbin site has mock urls that can be used for testing “http://httpbin.org/get”. Give this resource the name “TestWeb”

url <- "http://httpbin.org/get"
resource1 <- bfcadd(bfc, "TestWeb", fpath=url)
rid1 <- names(resource1)

Add another remote resource to the cache but do not automatically download. Call this resource “TestNoDownload”. You can use the same url as above or any of your choosing.

resource2 <- bfcadd(bfc, "TestNoDownload", fpath=url, download=FALSE)
rid2 <- names(resource2)

Check that the resources have been added to the cache. Do a query for resources matching “Test”. How many resources match?

bfcinfo(bfc)

## # A tibble: 3 x 10
##   rid   rname create_time access_time rpath rtype fpath last_modified_t…
##   <chr> <chr> <chr>       <chr>       <chr> <chr> <chr>            <dbl>
## 1 BFC1  NewR… 2019-07-19… 2019-07-19… /tmp… rela… 4dd3…               NA
## 2 BFC2  Test… 2019-07-19… 2019-07-19… /tmp… web   http…               NA
## 3 BFC3  Test… 2019-07-19… 2019-07-19… /tmp… web   http…               NA
## # … with 2 more variables: etag <chr>, expires <dbl>

bfccount(bfc)

## [1] 3

res <- bfcquery(bfc, "Test")
res

## # A tibble: 2 x 10
##   rid   rname create_time access_time rpath rtype fpath last_modified_t…
##   <chr> <chr> <chr>       <chr>       <chr> <chr> <chr>            <dbl>
## 1 BFC2  Test… 2019-07-19… 2019-07-19… /tmp… web   http…               NA
## 2 BFC3  Test… 2019-07-19… 2019-07-19… /tmp… web   http…               NA
## # … with 2 more variables: etag <chr>, expires <dbl>

bfccount(res)

## [1] 2

Do resources need to be updated?

bfcneedsupdate(bfc)

## BFC2 BFC3 
##   NA TRUE

Make a data.frame of metadata and add it to the cache. Do a query that would use the metadata.

# the metdata data.frmae can be for a subset of data or the entire cache.
# the constrain is a column rid must match the rids in the cache.
rid = c("BFC1", "BFC3")
vls1 = c(1000, 2000)
vls2 = c("An experiment", "Online resource")

meta <- as.data.frame(list(rid=rid, vls1=vls1, vls2=vls2))
bfcmeta(bfc, name="resourceData") <- meta

names(bfcinfo(bfc))

##  [1] "rid"                "rname"              "create_time"       
##  [4] "access_time"        "rpath"              "rtype"             
##  [7] "fpath"              "last_modified_time" "etag"              
## [10] "expires"            "vls1"               "vls2"

bfcquerycols(bfc)

##  [1] "rid"                "rname"              "create_time"       
##  [4] "access_time"        "rpath"              "rtype"             
##  [7] "fpath"              "last_modified_time" "etag"              
## [10] "expires"            "vls1"               "vls2"

library(dplyr)
bfcinfo(bfc) %>% dplyr::select("rid", "vls2")

## # A tibble: 3 x 2
##   rid   vls2           
##   <chr> <chr>          
## 1 BFC1  An experiment  
## 2 BFC2  <NA>           
## 3 BFC3  Online resource

res <- bfcquery(bfc, field="vls2", "experiment")
res

## # A tibble: 1 x 12
##   rid   rname create_time access_time rpath rtype fpath last_modified_t…
##   <chr> <chr> <chr>       <chr>       <chr> <chr> <chr>            <dbl>
## 1 BFC1  NewR… 2019-07-19… 2019-07-19… /tmp… rela… 4dd3…               NA
## # … with 4 more variables: etag <chr>, expires <dbl>, vls1 <dbl>, vls2 <chr>

# get id of resource matching query
# use that to load the resource
bfcrid(res)

## [1] "BFC1"

load(bfcrpath(bfc, rids=bfcrid(res)))

Lab 6: Answer Key

19 July 2019

Contents