knitr::opts_chunk$set(message = FALSE, warning = FALSE, comment = NA,
fig.width = 6.25, fig.height = 5)
library(ANCOMBC)
library(tidyverse)
Sparse Estimation of Correlations among Microbiomes (SECOM) (Lin, Eggesbø, and Peddada 2022) is a methodology that aims to detect both linear and nonlinear relationships between a pair of taxa within an ecosystem (e.g., gut) or across ecosystems (e.g., gut and tongue). SECOM corrects both sample-specific and taxon-specific biases and obtains a consistent estimator for the correlation matrix of microbial absolute abundances while maintaining the underlying true sparsity. For more details, please refer to the SECOM paper.
Download package.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ANCOMBC")
Load the package.
The HITChip Atlas dataset contains genus-level microbiota profiling with HITChip for 1006 western adults with no reported health complications, reported in (Lahti et al. 2014). The dataset is available via the microbiome R package (Lahti et al. 2017) in phyloseq (McMurdie and Holmes 2013) format.
data(atlas1006, package = "microbiome")
tse = mia::makeTreeSummarizedExperimentFromPhyloseq(atlas1006)
# subset to baseline
tse = tse[, tse$time == 0]
# Re-code the bmi group
tse$bmi = recode(tse$bmi_group,
obese = "obese",
severeobese = "obese",
morbidobese = "obese")
# Subset to lean, overweight, and obese subjects
tse = tse[, tse$bmi %in% c("lean", "overweight", "obese")]
# Create the region variable
tse$region = recode(as.character(tse$nationality),
Scandinavia = "NE", UKIE = "NE", SouthEurope = "SE",
CentralEurope = "CE", EasternEurope = "EE",
.missing = "unknown")
# Discard "EE" as it contains only 1 subject
# Discard subjects with missing values of region
tse = tse[, ! tse$region %in% c("EE", "unknown")]
print(tse)
class: TreeSummarizedExperiment
dim: 130 873
metadata(0):
assays(1): counts
rownames(130): Actinomycetaceae Aerococcus ... Xanthomonadaceae
Yersinia et rel.
rowData names(3): Phylum Family Genus
colnames(873): Sample-1 Sample-2 ... Sample-1005 Sample-1006
colData names(12): age sex ... bmi region
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
rowLinks: NULL
rowTree: NULL
colLinks: NULL
colTree: NULL
set.seed(123)
# Linear relationships
res_linear = secom_linear(data = list(tse), assay_name = "counts",
tax_level = "Phylum", pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), method = "pearson",
soft = FALSE, thresh_len = 20, n_cv = 10,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
# Nonlinear relationships
res_dist = secom_dist(data = list(tse), assay_name = "counts",
tax_level = "Phylum", pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), R = 1000,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
heat_linear_th = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Thresholding)") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic"),
axis.text.y = element_text(size = 12, face = "italic"),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_th
corr_linear = res_linear$corr_fl
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
heat_linear_fl = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Filtering)") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic"),
axis.text.y = element_text(size = 12, face = "italic"),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_fl
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0
df_dist = data.frame(get_upper_tri(corr_dist)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
tax_name = sort(union(df_dist$var1, df_dist$var2))
df_dist$var1 = factor(df_dist$var1, levels = tax_name)
df_dist$var2 = factor(df_dist$var2, levels = tax_name)
heat_dist_fl = df_dist %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Distance (Filtering)") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic"),
axis.text.y = element_text(size = 12, face = "italic"),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_dist_fl
To compute correlations whithin and across different ecosystems, one needs to make sure that there are samples in common across these ecosystems.
# Select subjects from "CE" and "NE"
tse1 = tse[, tse$region == "CE"]
tse2 = tse[, tse$region == "NE"]
# Rename samples to ensure there is an overlap of samples between CE and NE
colnames(tse1) = paste0("Sample-", seq_len(ncol(tse1)))
colnames(tse2) = paste0("Sample-", seq_len(ncol(tse2)))
print(tse1)
class: TreeSummarizedExperiment
dim: 130 578
metadata(0):
assays(1): counts
rownames(130): Actinomycetaceae Aerococcus ... Xanthomonadaceae
Yersinia et rel.
rowData names(3): Phylum Family Genus
colnames(578): Sample-1 Sample-2 ... Sample-577 Sample-578
colData names(12): age sex ... bmi region
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
rowLinks: NULL
rowTree: NULL
colLinks: NULL
colTree: NULL
class: TreeSummarizedExperiment
dim: 130 181
metadata(0):
assays(1): counts
rownames(130): Actinomycetaceae Aerococcus ... Xanthomonadaceae
Yersinia et rel.
rowData names(3): Phylum Family Genus
colnames(181): Sample-1 Sample-2 ... Sample-180 Sample-181
colData names(12): age sex ... bmi region
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
rowLinks: NULL
rowTree: NULL
colLinks: NULL
colTree: NULL
set.seed(123)
# Linear relationships
res_linear = secom_linear(data = list(CE = tse1, NE = tse2),
assay_name = c("counts", "counts"),
tax_level = c("Phylum", "Phylum"), pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), method = "pearson",
soft = FALSE, thresh_len = 20, n_cv = 10,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
# Nonlinear relationships
res_dist = secom_dist(data = list(CE = tse1, NE = tse2),
assay_name = c("counts", "counts"),
tax_level = c("Phylum", "Phylum"), pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), R = 1000,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")
heat_linear_th = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
na.value = "grey", midpoint = 0, limit = c(-1,1),
space = "Lab", name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Thresholding)") +
theme_bw() +
geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic",
color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_th
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")
heat_linear_fl = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
na.value = "grey", midpoint = 0, limit = c(-1,1),
space = "Lab", name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Filtering)") +
theme_bw() +
geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic",
color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_fl
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0
df_dist = data.frame(get_upper_tri(corr_dist)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2))
tax_name = sort(union(df_dist$var1, df_dist$var2))
df_dist$var1 = factor(df_dist$var1, levels = tax_name)
df_dist$var2 = factor(df_dist$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")
heat_dist_fl = df_dist %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
na.value = "grey", midpoint = 0, limit = c(-1,1),
space = "Lab", name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Distance (Filtering)") +
theme_bw() +
geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic",
color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_dist_fl
R version 4.3.0 RC (2023-04-13 r84269)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.2 LTS
Matrix products: default
BLAS: /home/biocbuild/bbs-3.17-bioc/R/lib/libRblas.so
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB LC_COLLATE=C
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: America/New_York
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] caret_6.0-94 lattice_0.21-8 doRNG_1.8.6 rngtools_1.5.2
[5] foreach_1.5.2 DT_0.27 phyloseq_1.44.0 lubridate_1.9.2
[9] forcats_1.0.0 stringr_1.5.0 dplyr_1.1.2 purrr_1.0.1
[13] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.2
[17] tidyverse_2.0.0 ANCOMBC_2.2.0
loaded via a namespace (and not attached):
[1] matrixStats_0.63.0 bitops_1.0-7
[3] DirichletMultinomial_1.42.0 httr_1.4.5
[5] RColorBrewer_1.1-3 doParallel_1.0.17
[7] numDeriv_2016.8-1.1 tools_4.3.0
[9] backports_1.4.1 utf8_1.2.3
[11] R6_2.5.1 vegan_2.6-4
[13] lazyeval_0.2.2 mgcv_1.8-42
[15] rhdf5filters_1.12.0 permute_0.9-7
[17] withr_2.5.0 gridExtra_2.3
[19] cli_3.6.1 Biobase_2.60.0
[21] sandwich_3.0-2 labeling_0.4.2
[23] sass_0.4.5 mvtnorm_1.1-3
[25] proxy_0.4-27 yulab.utils_0.0.6
[27] foreign_0.8-84 scater_1.28.0
[29] decontam_1.20.0 parallelly_1.35.0
[31] readxl_1.4.2 rstudioapi_0.14
[33] RSQLite_2.3.1 generics_0.1.3
[35] crosstalk_1.2.0 Matrix_1.5-4
[37] biomformat_1.28.0 ggbeeswarm_0.7.1
[39] fansi_1.0.4 DescTools_0.99.48
[41] S4Vectors_0.38.0 DECIPHER_2.28.0
[43] lifecycle_1.0.3 multcomp_1.4-23
[45] yaml_2.3.7 SummarizedExperiment_1.30.0
[47] rhdf5_2.44.0 recipes_1.0.6
[49] grid_4.3.0 blob_1.2.4
[51] crayon_1.5.2 beachmat_2.16.0
[53] pillar_1.9.0 knitr_1.42
[55] GenomicRanges_1.52.0 boot_1.3-28.1
[57] gld_2.6.6 estimability_1.4.1
[59] future.apply_1.10.0 codetools_0.2-19
[61] glue_1.6.2 data.table_1.14.8
[63] MultiAssayExperiment_1.26.0 vctrs_0.6.2
[65] treeio_1.24.0 Rdpack_2.4
[67] cellranger_1.1.0 gtable_0.3.3
[69] cachem_1.0.7 gower_1.0.1
[71] xfun_0.39 rbibutils_2.2.13
[73] prodlim_2023.03.31 coda_0.19-4
[75] survival_3.5-5 timeDate_4022.108
[77] SingleCellExperiment_1.22.0 iterators_1.0.14
[79] hardhat_1.3.0 lava_1.7.2.1
[81] gmp_0.7-1 ellipsis_0.3.2
[83] TH.data_1.1-2 ipred_0.9-14
[85] nlme_3.1-162 bit64_4.0.5
[87] GenomeInfoDb_1.36.0 bslib_0.4.2
[89] irlba_2.3.5.1 vipor_0.4.5
[91] rpart_4.1.19 colorspace_2.1-0
[93] BiocGenerics_0.46.0 DBI_1.1.3
[95] Hmisc_5.0-1 nnet_7.3-18
[97] ade4_1.7-22 Exact_3.2
[99] tidyselect_1.2.0 emmeans_1.8.5
[101] bit_4.0.5 compiler_4.3.0
[103] htmlTable_2.4.1 BiocNeighbors_1.18.0
[105] expm_0.999-7 DelayedArray_0.26.0
[107] checkmate_2.1.0 scales_1.2.1
[109] digest_0.6.31 minqa_1.2.5
[111] rmarkdown_2.21 XVector_0.40.0
[113] htmltools_0.5.5 pkgconfig_2.0.3
[115] base64enc_0.1-3 lme4_1.1-33
[117] sparseMatrixStats_1.12.0 MatrixGenerics_1.12.0
[119] highr_0.10 fastmap_1.1.1
[121] rlang_1.1.0 htmlwidgets_1.6.2
[123] DelayedMatrixStats_1.22.0 farver_2.1.1
[125] jquerylib_0.1.4 zoo_1.8-12
[127] jsonlite_1.8.4 energy_1.7-11
[129] BiocParallel_1.34.0 ModelMetrics_1.2.2.2
[131] BiocSingular_1.16.0 RCurl_1.98-1.12
[133] magrittr_2.0.3 Formula_1.2-5
[135] scuttle_1.10.0 GenomeInfoDbData_1.2.10
[137] Rhdf5lib_1.22.0 munsell_0.5.0
[139] Rcpp_1.0.10 ape_5.7-1
[141] viridis_0.6.2 CVXR_1.0-11
[143] stringi_1.7.12 pROC_1.18.0
[145] rootSolve_1.8.2.3 zlibbioc_1.46.0
[147] MASS_7.3-59 plyr_1.8.8
[149] parallel_4.3.0 listenv_0.9.0
[151] ggrepel_0.9.3 lmom_2.9
[153] Biostrings_2.68.0 splines_4.3.0
[155] multtest_2.56.0 hms_1.1.3
[157] igraph_1.4.2 reshape2_1.4.4
[159] stats4_4.3.0 ScaledMatrix_1.8.0
[161] evaluate_0.20 nloptr_2.0.3
[163] tzdb_0.3.0 future_1.32.0
[165] rsvd_1.0.5 xtable_1.8-4
[167] Rmpfr_0.9-2 e1071_1.7-13
[169] tidytree_0.4.2 viridisLite_0.4.1
[171] class_7.3-21 gsl_2.1-8
[173] lmerTest_3.1-3 memoise_2.0.1
[175] beeswarm_0.4.0 IRanges_2.34.0
[177] cluster_2.1.4 TreeSummarizedExperiment_2.8.0
[179] timechange_0.2.0 globals_0.16.2
[181] mia_1.8.0
Lahti, Leo, Jarkko Salojärvi, Anne Salonen, Marten Scheffer, and Willem M De Vos. 2014. “Tipping Elements in the Human Intestinal Ecosystem.” Nature Communications 5 (1): 1–10.
Lahti, Leo, Sudarshan Shetty, T Blake, J Salojarvi, and others. 2017. “Tools for Microbiome Analysis in R.” Version 1: 10013.
Lin, Huang, Merete Eggesbø, and Shyamal Das Peddada. 2022. “Linear and Nonlinear Correlation Estimators Unveil Undescribed Taxa Interactions in Microbiome Data.” Nature Communications 13 (1): 1–16.
McMurdie, Paul J, and Susan Holmes. 2013. “Phyloseq: An R Package for Reproducible Interactive Analysis and Graphics of Microbiome Census Data.” PloS One 8 (4): e61217.