FGSEA of Omics Datasets

library(MotrpacRatTraining6moWATData)
library(MotrpacRatTraining6moWAT)
library(dplyr)
library(tidyr)
library(tibble)
library(purrr)

Overview

Fast Gene Set Enrichment Analysis (FGSEA)^[1] will be performed using Entrez gene identifiers for proteomics and transcriptomics and RefMet metabolite identifiers for metabolomics. Gene Ontology gene sets will be used for proteomics and phosphoproteomics, while RefMet chemical subclasses will be used to group metabolites into metabolite sets. We assume that the majority of the white adipose tissue (WAT) proteome and transcriptome were captured by our datasets, so we only keep gene sets that retain the majority of their genes after filtering to what was present in our -omics datasets. That is, only those gene sets with at least 85% of their original genes will be used for testing. Gene sets will be filtered further based on size so that they must contain at least 15 genes. For metabolomics, a minimum of 10 metabolites per chemical subclass are required, though a membership filter will not be applied. Sets contain no more than 300 members due to pre-filtering of MSIGDB_PATHWAYS (see help("MSIGDB_PATHWAYS") for details).

Transcriptomics

There is a one-to-many mapping between transcripts and Entrez genes, so we will need to account for this.

table(grepl(";", TRNSCRPT_DA$MvF_SED$entrez_gene))
#> 
#> FALSE  TRUE 
#> 16227   177
# 177 transcripts have multiple genes. Separate to multiple rows

TRNSCRPT_DA_SEP <- TRNSCRPT_DA %>%
  map(.f = ~ separate_rows(.x, entrez_gene, gene_symbol, sep = ";") %>%
        filter(entrez_gene != "NA"))

# Entrez to gene symbol conversion vector for leading edge
TRNSCRPT_entrez_to_symbol <- TRNSCRPT_DA_SEP$MvF_SED %>%
  dplyr::select(entrez_gene, gene_symbol) %>%
  distinct() %>%
  deframe()
head(TRNSCRPT_entrez_to_symbol)
#>    24379   296511   498922   296272   297738   362454 
#>   "Gad1"   "Alx4"  "Cbln1"  "Tcf15" "Steap1"  "Hebp1"

# Select gene sets that are largely unchanged when filtering
# to only those Entrez IDs present in the results.
TRNSCRPT_MSIGDB <- MSIGDB_PATHWAYS %>%
  mutate(entrez_gene = map(.x = entrez_gene, .f = intersect,
                           y = names(TRNSCRPT_entrez_to_symbol)),
         set_size_post = lengths(entrez_gene),
         ratio = set_size_post / set_size) %>%
  filter(ratio >= 0.85, # at least 85% of the original set remains
         set_size_post >= 15)

table(TRNSCRPT_MSIGDB$gs_subcat) # how many gene sets remain?
#> 
#> GO:BP GO:CC GO:MF 
#>  2715   320   480
# GO:BP GO:CC GO:MF
#  2864   347   498

# FGSEA
TRNSCRPT_FGSEA <- map(TRNSCRPT_DA_SEP, function(res_i) {
  fgsea2(pathways = TRNSCRPT_MSIGDB,
         stats = rank_genes(res_i, genes = "entrez_gene"),
         seed = 0, nPermSimple = 10000,
         adjust.globally = TRUE, nproc = 1) %>%
    # Map Entrez IDs in leading edge subset to gene symbols
    mutate(leadingEdge_genes = map(
      .x = leadingEdge,
      .f = ~ na.omit(TRNSCRPT_entrez_to_symbol[as.character(.x)])
    )) %>%
    # Reorder columns
    dplyr::select(pathway, gs_subcat, gs_description, everything()) %>%
    relocate(contrast, .before = leadingEdge)
})

Proteomics

# Entrez to gene symbol conversion vector for leading edge
PROT_entrez_to_symbol <- fData(PROT_EXP) %>%
  dplyr::select(entrez_gene, gene_symbol) %>%
  distinct() %>%
  deframe()
head(PROT_entrez_to_symbol)
#>     26194     26195     26198     26196     26204     26199 
#>  "Mt-nd2"  "Mt-co1"  "Mt-co2" "Mt-atp8"  "Mt-co3"  "Mt-nd3"

# Select gene sets that are largely unchanged when filtering
# to only those Entrez IDs present in the results.
PROT_MSIGDB <- MSIGDB_PATHWAYS %>%
  mutate(entrez_gene = map(.x = entrez_gene, .f = intersect,
                           y = names(PROT_entrez_to_symbol)),
         set_size_post = lengths(entrez_gene),
         ratio = set_size_post / set_size) %>%
  filter(ratio >= 0.85, # at least 85% of the original set remains
         set_size_post >= 15)

table(PROT_MSIGDB$gs_subcat) # how many gene sets remain?
#> 
#> GO:BP GO:CC GO:MF 
#>   230    97    77
# GO:BP GO:CC GO:MF
#   234    99    78

# FGSEA
PROT_FGSEA <- map(PROT_DA, function(res_i) {
  fgsea2(pathways = PROT_MSIGDB,
         stats = rank_genes(res_i, genes = "entrez_gene"),
         seed = 0, nPermSimple = 10000,
         adjust.globally = TRUE, nproc = 1) %>%
    # Map Entrez IDs in leading edge subset to gene symbols
    mutate(leadingEdge_genes = map(
      .x = leadingEdge,
      .f = ~ na.omit(PROT_entrez_to_symbol[as.character(.x)])
    )) %>%
    # Reorder columns
    dplyr::select(pathway, gs_subcat, gs_description, everything()) %>%
    relocate(contrast, .before = leadingEdge)
})

Metabolomics

Unlike with proteomics and transcriptomics, we are not limited to testing terms that are largely unchanged after filtering. This is because the RefMet chemical subclasses are homogenous groups (e.g., subsetting the acyl carnitines will still result in a group of only acyl carnitines).

# Reformat fData for use with fgsea2
REFMET_SUBCLASSES <- fData(METAB_EXP) %>%
  group_by(refmet_sub_class) %>%
  summarise(feature = list(feature_ID)) %>%
  mutate(gs_subcat = "refmet_sub_class",
         gs_exact_source = refmet_sub_class,
         gs_description = refmet_sub_class,
         set_size = lengths(feature)) %>%
  filter(set_size >= 10)

nrow(REFMET_SUBCLASSES) # 19
#> [1] 19

# FGSEA
METAB_FGSEA <- map(METAB_DA, function(res_i) {
  fgsea2(pathways = REFMET_SUBCLASSES,
         gene_column = "feature",
         stats = rank_genes(res_i, genes = "feature"),
         seed = 0, nPermSimple = 10000,
         adjust.globally = TRUE, nproc = 1) %>%
    # Reorder columns
    dplyr::rename(refmet_sub_class = pathway) %>%
    dplyr::select(-starts_with("gs_")) %>%
    relocate(contrast, .before = leadingEdge)
})

# Save
usethis::use_data(TRNSCRPT_FGSEA, internal = FALSE, overwrite = TRUE,
                  version = 3, compress = "bzip2")

usethis::use_data(PROT_FGSEA, internal = FALSE, overwrite = TRUE,
                  version = 3, compress = "bzip2")

usethis::use_data(METAB_FGSEA, internal = FALSE, overwrite = TRUE,
                  version = 3, compress = "bzip2")

Session Info

sessionInfo()
#> R version 4.4.0 (2024-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#>  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#>  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#> [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#> 
#> time zone: UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] purrr_1.0.2                        tibble_3.2.1                      
#> [3] tidyr_1.3.1                        dplyr_1.1.4                       
#> [5] MotrpacRatTraining6moWAT_1.0.1     Biobase_2.64.0                    
#> [7] BiocGenerics_0.50.0                MotrpacRatTraining6moWATData_2.0.0
#> 
#> loaded via a namespace (and not attached):
#>   [1] RColorBrewer_1.1-3      rstudioapi_0.16.0       jsonlite_1.8.8         
#>   [4] shape_1.4.6.1           magrittr_2.0.3          ggbeeswarm_0.7.2       
#>   [7] rmarkdown_2.26          GlobalOptions_0.1.2     fs_1.6.4               
#>  [10] zlibbioc_1.50.0         ragg_1.3.0              vctrs_0.6.5            
#>  [13] memoise_2.0.1           base64enc_0.1-3         rstatix_0.7.2          
#>  [16] htmltools_0.5.8.1       dynamicTreeCut_1.63-1   curl_5.2.1             
#>  [19] broom_1.0.5             Formula_1.2-5           sass_0.4.9             
#>  [22] bslib_0.7.0             htmlwidgets_1.6.4       desc_1.4.3             
#>  [25] impute_1.78.0           cachem_1.0.8            lifecycle_1.0.4        
#>  [28] iterators_1.0.14        pkgconfig_2.0.3         Matrix_1.7-0           
#>  [31] R6_2.5.1                fastmap_1.1.1           GenomeInfoDbData_1.2.12
#>  [34] clue_0.3-65             digest_0.6.35           colorspace_2.1-0       
#>  [37] patchwork_1.2.0         AnnotationDbi_1.65.2    S4Vectors_0.42.0       
#>  [40] textshaping_0.3.7       Hmisc_5.1-2             RSQLite_2.3.6          
#>  [43] ggpubr_0.6.0            filelock_1.0.3          latex2exp_0.9.6        
#>  [46] fansi_1.0.6             httr_1.4.7              abind_1.4-5            
#>  [49] compiler_4.4.0          withr_3.0.0             bit64_4.0.5            
#>  [52] doParallel_1.0.17       htmlTable_2.4.2         backports_1.4.1        
#>  [55] BiocParallel_1.38.0     carData_3.0-5           DBI_1.2.2              
#>  [58] ggsignif_0.6.4          rjson_0.2.21            tools_4.4.0            
#>  [61] vipor_0.4.7             foreign_0.8-86          beeswarm_0.4.0         
#>  [64] msigdbr_7.5.1           nnet_7.3-19             glue_1.7.0             
#>  [67] grid_4.4.0              checkmate_2.3.1         cluster_2.1.6          
#>  [70] fgsea_1.30.0            generics_0.1.3          gtable_0.3.5           
#>  [73] preprocessCore_1.66.0   data.table_1.15.4       WGCNA_1.72-5           
#>  [76] car_3.1-2               utf8_1.2.4              XVector_0.44.0         
#>  [79] foreach_1.5.2           pillar_1.9.0            stringr_1.5.1          
#>  [82] babelgene_22.9          limma_3.60.0            circlize_0.4.16        
#>  [85] splines_4.4.0           BiocFileCache_2.12.0    lattice_0.22-6         
#>  [88] survival_3.5-8          bit_4.0.5               tidyselect_1.2.1       
#>  [91] GO.db_3.19.1            ComplexHeatmap_2.20.0   locfit_1.5-9.9         
#>  [94] Biostrings_2.72.0       knitr_1.46              gridExtra_2.3          
#>  [97] IRanges_2.38.0          edgeR_4.2.0             stats4_4.4.0           
#> [100] xfun_0.43               statmod_1.5.0           matrixStats_1.3.0      
#> [103] stringi_1.8.3           UCSC.utils_1.0.0        yaml_2.3.8             
#> [106] evaluate_0.23           codetools_0.2-20        cli_3.6.2              
#> [109] ontologyIndex_2.12      rpart_4.1.23            systemfonts_1.0.6      
#> [112] munsell_0.5.1           jquerylib_0.1.4         Rcpp_1.0.12            
#> [115] GenomeInfoDb_1.40.0     dbplyr_2.5.0            png_0.1-8              
#> [118] fastcluster_1.2.6       parallel_4.4.0          pkgdown_2.0.9          
#> [121] ggplot2_3.5.1           blob_1.2.4              scales_1.3.0           
#> [124] crayon_1.5.2            GetoptLong_1.0.5        rlang_1.1.3            
#> [127] cowplot_1.1.3           fastmatch_1.1-4         KEGGREST_1.44.0

References

1. Korotkevich, G., Sukhov, V., & Sergushichev, A. (2019). Fast gene set enrichment analysis. bioRxiv. https://doi.org/10.1101/060012