vignettes/MAIN_FGSEA.Rmd
MAIN_FGSEA.Rmd
library(MotrpacRatTraining6moWATData)
library(MotrpacRatTraining6moWAT)
library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
Fast Gene Set Enrichment Analysis (FGSEA)[1] will be performed
using Entrez gene identifiers for proteomics and transcriptomics and
RefMet metabolite identifiers for metabolomics. Gene Ontology gene sets
will be used for proteomics and phosphoproteomics, while RefMet chemical
subclasses will be used to group metabolites into metabolite sets. We
assume that the majority of the white adipose tissue (WAT) proteome and
transcriptome were captured by our datasets, so we only keep gene sets
that retain the majority of their genes after filtering to what was
present in our -omics datasets. That is, only those gene sets with at
least 85% of their original genes will be used for testing. Gene sets
will be filtered further based on size so that they must contain at
least 15 genes. For metabolomics, a minimum of 10 metabolites per
chemical subclass are required, though a membership filter will not be
applied. Sets contain no more than 300 members due to pre-filtering of
MSIGDB_PATHWAYS
(see help("MSIGDB_PATHWAYS")
for details).
There is a one-to-many mapping between transcripts and Entrez genes, so we will need to account for this.
table(grepl(";", TRNSCRPT_DA$MvF_SED$entrez_gene))
#>
#> FALSE TRUE
#> 16227 177
# 177 transcripts have multiple genes. Separate to multiple rows
TRNSCRPT_DA_SEP <- TRNSCRPT_DA %>%
map(.f = ~ separate_rows(.x, entrez_gene, gene_symbol, sep = ";") %>%
filter(entrez_gene != "NA"))
# Entrez to gene symbol conversion vector for leading edge
TRNSCRPT_entrez_to_symbol <- TRNSCRPT_DA_SEP$MvF_SED %>%
dplyr::select(entrez_gene, gene_symbol) %>%
distinct() %>%
deframe()
head(TRNSCRPT_entrez_to_symbol)
#> 24379 296511 498922 296272 297738 362454
#> "Gad1" "Alx4" "Cbln1" "Tcf15" "Steap1" "Hebp1"
# Select gene sets that are largely unchanged when filtering
# to only those Entrez IDs present in the results.
TRNSCRPT_MSIGDB <- MSIGDB_PATHWAYS %>%
mutate(entrez_gene = map(.x = entrez_gene, .f = intersect,
y = names(TRNSCRPT_entrez_to_symbol)),
set_size_post = lengths(entrez_gene),
ratio = set_size_post / set_size) %>%
filter(ratio >= 0.85, # at least 85% of the original set remains
set_size_post >= 15)
table(TRNSCRPT_MSIGDB$gs_subcat) # how many gene sets remain?
#>
#> GO:BP GO:CC GO:MF
#> 2715 320 480
# GO:BP GO:CC GO:MF
# 2864 347 498
# FGSEA
TRNSCRPT_FGSEA <- map(TRNSCRPT_DA_SEP, function(res_i) {
fgsea2(pathways = TRNSCRPT_MSIGDB,
stats = rank_genes(res_i, genes = "entrez_gene"),
seed = 0, nPermSimple = 10000,
adjust.globally = TRUE, nproc = 1) %>%
# Map Entrez IDs in leading edge subset to gene symbols
mutate(leadingEdge_genes = map(
.x = leadingEdge,
.f = ~ na.omit(TRNSCRPT_entrez_to_symbol[as.character(.x)])
)) %>%
# Reorder columns
dplyr::select(pathway, gs_subcat, gs_description, everything()) %>%
relocate(contrast, .before = leadingEdge)
})
# Entrez to gene symbol conversion vector for leading edge
PROT_entrez_to_symbol <- fData(PROT_EXP) %>%
dplyr::select(entrez_gene, gene_symbol) %>%
distinct() %>%
deframe()
head(PROT_entrez_to_symbol)
#> 26194 26195 26198 26196 26204 26199
#> "Mt-nd2" "Mt-co1" "Mt-co2" "Mt-atp8" "Mt-co3" "Mt-nd3"
# Select gene sets that are largely unchanged when filtering
# to only those Entrez IDs present in the results.
PROT_MSIGDB <- MSIGDB_PATHWAYS %>%
mutate(entrez_gene = map(.x = entrez_gene, .f = intersect,
y = names(PROT_entrez_to_symbol)),
set_size_post = lengths(entrez_gene),
ratio = set_size_post / set_size) %>%
filter(ratio >= 0.85, # at least 85% of the original set remains
set_size_post >= 15)
table(PROT_MSIGDB$gs_subcat) # how many gene sets remain?
#>
#> GO:BP GO:CC GO:MF
#> 230 97 77
# GO:BP GO:CC GO:MF
# 234 99 78
# FGSEA
PROT_FGSEA <- map(PROT_DA, function(res_i) {
fgsea2(pathways = PROT_MSIGDB,
stats = rank_genes(res_i, genes = "entrez_gene"),
seed = 0, nPermSimple = 10000,
adjust.globally = TRUE, nproc = 1) %>%
# Map Entrez IDs in leading edge subset to gene symbols
mutate(leadingEdge_genes = map(
.x = leadingEdge,
.f = ~ na.omit(PROT_entrez_to_symbol[as.character(.x)])
)) %>%
# Reorder columns
dplyr::select(pathway, gs_subcat, gs_description, everything()) %>%
relocate(contrast, .before = leadingEdge)
})
Unlike with proteomics and transcriptomics, we are not limited to testing terms that are largely unchanged after filtering. This is because the RefMet chemical subclasses are homogenous groups (e.g., subsetting the acyl carnitines will still result in a group of only acyl carnitines).
# Reformat fData for use with fgsea2
REFMET_SUBCLASSES <- fData(METAB_EXP) %>%
group_by(refmet_sub_class) %>%
summarise(feature = list(feature_ID)) %>%
mutate(gs_subcat = "refmet_sub_class",
gs_exact_source = refmet_sub_class,
gs_description = refmet_sub_class,
set_size = lengths(feature)) %>%
filter(set_size >= 10)
nrow(REFMET_SUBCLASSES) # 19
#> [1] 19
# FGSEA
METAB_FGSEA <- map(METAB_DA, function(res_i) {
fgsea2(pathways = REFMET_SUBCLASSES,
gene_column = "feature",
stats = rank_genes(res_i, genes = "feature"),
seed = 0, nPermSimple = 10000,
adjust.globally = TRUE, nproc = 1) %>%
# Reorder columns
dplyr::rename(refmet_sub_class = pathway) %>%
dplyr::select(-starts_with("gs_")) %>%
relocate(contrast, .before = leadingEdge)
})
# Save
usethis::use_data(TRNSCRPT_FGSEA, internal = FALSE, overwrite = TRUE,
version = 3, compress = "bzip2")
usethis::use_data(PROT_FGSEA, internal = FALSE, overwrite = TRUE,
version = 3, compress = "bzip2")
usethis::use_data(METAB_FGSEA, internal = FALSE, overwrite = TRUE,
version = 3, compress = "bzip2")
sessionInfo()
#> R version 4.4.0 (2024-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
#>
#> locale:
#> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
#> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
#> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
#> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] purrr_1.0.2 tibble_3.2.1
#> [3] tidyr_1.3.1 dplyr_1.1.4
#> [5] MotrpacRatTraining6moWAT_1.0.1 Biobase_2.64.0
#> [7] BiocGenerics_0.50.0 MotrpacRatTraining6moWATData_2.0.0
#>
#> loaded via a namespace (and not attached):
#> [1] RColorBrewer_1.1-3 rstudioapi_0.16.0 jsonlite_1.8.8
#> [4] shape_1.4.6.1 magrittr_2.0.3 ggbeeswarm_0.7.2
#> [7] rmarkdown_2.26 GlobalOptions_0.1.2 fs_1.6.4
#> [10] zlibbioc_1.50.0 ragg_1.3.0 vctrs_0.6.5
#> [13] memoise_2.0.1 base64enc_0.1-3 rstatix_0.7.2
#> [16] htmltools_0.5.8.1 dynamicTreeCut_1.63-1 curl_5.2.1
#> [19] broom_1.0.5 Formula_1.2-5 sass_0.4.9
#> [22] bslib_0.7.0 htmlwidgets_1.6.4 desc_1.4.3
#> [25] impute_1.78.0 cachem_1.0.8 lifecycle_1.0.4
#> [28] iterators_1.0.14 pkgconfig_2.0.3 Matrix_1.7-0
#> [31] R6_2.5.1 fastmap_1.1.1 GenomeInfoDbData_1.2.12
#> [34] clue_0.3-65 digest_0.6.35 colorspace_2.1-0
#> [37] patchwork_1.2.0 AnnotationDbi_1.65.2 S4Vectors_0.42.0
#> [40] textshaping_0.3.7 Hmisc_5.1-2 RSQLite_2.3.6
#> [43] ggpubr_0.6.0 filelock_1.0.3 latex2exp_0.9.6
#> [46] fansi_1.0.6 httr_1.4.7 abind_1.4-5
#> [49] compiler_4.4.0 withr_3.0.0 bit64_4.0.5
#> [52] doParallel_1.0.17 htmlTable_2.4.2 backports_1.4.1
#> [55] BiocParallel_1.38.0 carData_3.0-5 DBI_1.2.2
#> [58] ggsignif_0.6.4 rjson_0.2.21 tools_4.4.0
#> [61] vipor_0.4.7 foreign_0.8-86 beeswarm_0.4.0
#> [64] msigdbr_7.5.1 nnet_7.3-19 glue_1.7.0
#> [67] grid_4.4.0 checkmate_2.3.1 cluster_2.1.6
#> [70] fgsea_1.30.0 generics_0.1.3 gtable_0.3.5
#> [73] preprocessCore_1.66.0 data.table_1.15.4 WGCNA_1.72-5
#> [76] car_3.1-2 utf8_1.2.4 XVector_0.44.0
#> [79] foreach_1.5.2 pillar_1.9.0 stringr_1.5.1
#> [82] babelgene_22.9 limma_3.60.0 circlize_0.4.16
#> [85] splines_4.4.0 BiocFileCache_2.12.0 lattice_0.22-6
#> [88] survival_3.5-8 bit_4.0.5 tidyselect_1.2.1
#> [91] GO.db_3.19.1 ComplexHeatmap_2.20.0 locfit_1.5-9.9
#> [94] Biostrings_2.72.0 knitr_1.46 gridExtra_2.3
#> [97] IRanges_2.38.0 edgeR_4.2.0 stats4_4.4.0
#> [100] xfun_0.43 statmod_1.5.0 matrixStats_1.3.0
#> [103] stringi_1.8.3 UCSC.utils_1.0.0 yaml_2.3.8
#> [106] evaluate_0.23 codetools_0.2-20 cli_3.6.2
#> [109] ontologyIndex_2.12 rpart_4.1.23 systemfonts_1.0.6
#> [112] munsell_0.5.1 jquerylib_0.1.4 Rcpp_1.0.12
#> [115] GenomeInfoDb_1.40.0 dbplyr_2.5.0 png_0.1-8
#> [118] fastcluster_1.2.6 parallel_4.4.0 pkgdown_2.0.9
#> [121] ggplot2_3.5.1 blob_1.2.4 scales_1.3.0
#> [124] crayon_1.5.2 GetoptLong_1.0.5 rlang_1.1.3
#> [127] cowplot_1.1.3 fastmatch_1.1-4 KEGGREST_1.44.0