GenomicDataCommons?

From https://support.bioconductor.org/p/9138939/.
library(GenomicDataCommons,quietly = TRUE)
I made a small change to the filtering expression approach based on
changes to lazy evaluation best practices. There is now no need to
include the `~` in the filter expression. So:
# Query the files endpoint for TCGA-COAD RNA-Seq aligned reads stored as BAM.
q <- files() |>
  GenomicDataCommons::filter(
    cases.project.project_id == "TCGA-COAD" &
      data_type == "Aligned Reads" &
      experimental_strategy == "RNA-Seq" &
      data_format == "BAM"
  )
And get a count of the results:
# Number of records matching the query.
q |> count()
## [1] 1188
And the manifest.
# Retrieve the manifest for the files selected by the query.
q |> manifest()
id <chr> | proportion_reads_mapped <dbl> | access <chr> | |
---|---|---|---|
bd92bd8d-e4a8-4a6a-b954-710db439e210 | 0.9822403 | controlled | |
f565bd2a-f96f-4a77-9f83-d839666f7d0d | NA | controlled | |
cc3a9c05-0a89-4083-8f30-e39d1b301e55 | 0.9749086 | controlled | |
45f9ebf2-30af-445b-84a3-f18a39cf3021 | 0.9880719 | controlled | |
607a8389-340c-4d5e-8f09-a76a47b421d2 | NA | controlled | |
1270d172-4d58-4e16-ad6d-1c7357fcd07a | 0.9817795 | controlled | |
a2524e56-2555-44ec-af35-c67a75fe2ca3 | NA | controlled | |
79ba4256-9439-4065-88d8-9fb4fa060142 | NA | controlled | |
d78f69bd-a335-4e2e-a877-d763b970916a | 0.9861397 | controlled | |
86bc2b10-0d64-4ff0-b47d-021c02631a22 | 0.9922583 | controlled |
Your question about race and ethnicity is a good one.
# Collect every field name that can be used with the files endpoint.
all_fields <- files() |> available_fields()
And we can grep for `race` or `ethnic` to get potential matching fields
to look at.
# Keep the field names containing "race" or "ethnic". This is a plain
# substring match, so unrelated names such as "...scan_tracer_used" and
# "...contraceptive..." are returned as well.
grep(pattern = "race|ethnic", x = all_fields, value = TRUE)
## [1] "cases.demographic.ethnicity"
## [2] "cases.demographic.race"
## [3] "cases.follow_ups.hormonal_contraceptive_type"
## [4] "cases.follow_ups.hormonal_contraceptive_use"
## [5] "cases.follow_ups.other_clinical_attributes.hormonal_contraceptive_type"
## [6] "cases.follow_ups.other_clinical_attributes.hormonal_contraceptive_use"
## [7] "cases.follow_ups.scan_tracer_used"
Now, we can check available values for each field to determine how to complete our filter expressions.
# List the values observed for the ethnicity field on the files endpoint.
available_values("files", "cases.demographic.ethnicity")
## [1] "not hispanic or latino" "not reported" "hispanic or latino"
## [4] "unknown" "_missing"
# List the values observed for the race field on the files endpoint.
available_values("files", "cases.demographic.race")
## [1] "white"
## [2] "not reported"
## [3] "black or african american"
## [4] "asian"
## [5] "unknown"
## [6] "other"
## [7] "american indian or alaska native"
## [8] "native hawaiian or other pacific islander"
## [9] "not allowed to collect"
## [10] "_missing"
We can complete our filter expression now to limit to `white` race only.
# Add a race condition to the existing query `q`, then count the
# records that remain.
q_white_only <- GenomicDataCommons::filter(q, cases.demographic.race == "white")
q_white_only |> count()
## [1] 695
# Retrieve the manifest for the race-restricted query.
q_white_only |> manifest()
id <chr> | proportion_reads_mapped <dbl> | access <chr> | |
---|---|---|---|
1270d172-4d58-4e16-ad6d-1c7357fcd07a | 0.9817795 | controlled | |
a2524e56-2555-44ec-af35-c67a75fe2ca3 | NA | controlled | |
79ba4256-9439-4065-88d8-9fb4fa060142 | NA | controlled | |
6739c074-3676-4a11-a288-fff1d07c675f | 0.9944030 | controlled | |
262be809-965f-446a-aaf9-495468e8a0c9 | 0.9778098 | controlled | |
fb0ea225-1004-412e-892a-f01dc9d14581 | NA | controlled | |
79126fea-a11b-4410-9e74-60e333eee910 | 0.9809888 | controlled | |
2b4ef3d7-b8fa-4356-bee1-469244fb09c9 | 0.9830173 | controlled | |
6c083b79-367b-4a67-864f-2bc349335559 | NA | controlled | |
788adca0-1a65-4c33-80b0-ac1f937dd2e4 | NA | controlled |
GenomicDataCommons?

I would like to get the number of cases added (created; any logical datetime would suffice here) to the TCGA project by experiment type. I attempted to get this data via the GenomicDataCommons package, but I believe it is giving me the number of files for a given experiment type rather than the number of cases. How can I get the number of cases for which there is RNA-Seq data?
library(tibble)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:GenomicDataCommons':
##
## count, filter, select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(GenomicDataCommons)
# Query the cases endpoint for TCGA cases that have RNA-Seq files, facet the
# result on the files' creation datetime, and list the resulting buckets
# (doc_count, key) in descending key order.
cases() |>
  GenomicDataCommons::filter(
    # Bare expression: the formula prefix `~` is not needed in filter().
    project.program.name == "TCGA" & files.experimental_strategy == "RNA-Seq"
  ) |>
  facet("files.created_datetime") |>
  aggregations() |>
  # Presumably aggregations() returns a list holding one table per facet;
  # dropping the names and unwrapping one level hands that single table
  # to as_tibble().
  unname() |>
  unlist(recursive = FALSE) |>
  as_tibble() |>
  dplyr::arrange(dplyr::desc(key))
doc_count <int> | key <chr> | |||
---|---|---|---|---|
271 | 2024-06-14t14:27:00.916424-05:00 | |||
412 | 2024-06-14t13:28:10.644120-05:00 | |||
151 | 2023-03-09t00:35:51.387873-06:00 | |||
79 | 2023-02-19t04:41:11.008116-06:00 | |||
458 | 2023-02-19t04:36:10.605050-06:00 | |||
80 | 2023-02-19t04:28:49.400023-06:00 | |||
178 | 2023-02-19t04:23:49.092629-06:00 | |||
516 | 2023-02-19t04:18:49.453628-06:00 | |||
179 | 2023-02-19t04:13:47.877168-06:00 | |||
290 | 2023-02-19t04:08:47.478925-06:00 |