# Reformat DEA results
human_res <- PHOSPHO_DA %>%
  map(function(res_i) {
    filter(res_i, !is.na(human_uniprot)) %>%
      mutate(num_sites = str_count(human_site, ";") + 1) %>%
      separate_rows(human_site) %>% # single-site-level data
      mutate(human_feature = paste0(human_uniprot, "_", human_site)) %>%
      dplyr::select(contrast, human_feature, logFC, P.Value, num_sites) %>%
      distinct()
  })

map(human_res, head)
#> $trained_vs_SED
#> # A tibble: 6 × 5
#>   contrast     human_feature   logFC P.Value num_sites
#>   <fct>        <chr>           <dbl>   <dbl>     <dbl>
#> 1 F_1W - F_SED Q9UHL9_S448   -0.105    0.669         1
#> 2 F_1W - F_SED P02545_S12     0.189    0.344         1
#> 3 F_1W - F_SED P02545_S143    0.157    0.448         1
#> 4 F_1W - F_SED P02545_S153   -0.0221   0.937         1
#> 5 F_1W - F_SED P02545_S17     0.0838   0.607         1
#> 6 F_1W - F_SED P02545_S18    -0.0247   0.834         1
#> 
#> $MvF_SED
#> # A tibble: 6 × 5
#>   contrast      human_feature   logFC P.Value num_sites
#>   <fct>         <chr>           <dbl>   <dbl>     <dbl>
#> 1 M_SED - F_SED Q9UHL9_S448   -0.123  0.657           1
#> 2 M_SED - F_SED P02545_S12     0.193  0.378           1
#> 3 M_SED - F_SED P02545_S143   -0.789  0.00131         1
#> 4 M_SED - F_SED P02545_S153   -0.0890 0.800           1
#> 5 M_SED - F_SED P02545_S17    -0.366  0.0401          1
#> 6 M_SED - F_SED P02545_S18    -0.161  0.216           1
#> 
#> $MvF_exercise_response
#> # A tibble: 6 × 5
#>   contrast                        human_feature   logFC P.Value num_sites
#>   <fct>                           <chr>           <dbl>   <dbl>     <dbl>
#> 1 (M_1W - M_SED) - (F_1W - F_SED) Q9UHL9_S448    0.289    0.474         1
#> 2 (M_1W - M_SED) - (F_1W - F_SED) P02545_S12    -0.339    0.264         1
#> 3 (M_1W - M_SED) - (F_1W - F_SED) P02545_S143    0.130    0.689         1
#> 4 (M_1W - M_SED) - (F_1W - F_SED) P02545_S153   -0.355    0.431         1
#> 5 (M_1W - M_SED) - (F_1W - F_SED) P02545_S17    -0.0562   0.817         1
#> 6 (M_1W - M_SED) - (F_1W - F_SED) P02545_S18    -0.0172   0.923         1
# List of substrate sites by kinase (379 kinases before filtering)
KS_sets <- PSP_KINASE_SUBSTRATE %>%
  transmute(kinase = GENE,
            substrate = paste0(SUB_ACC_ID, "_", SUB_MOD_RSD)) %>%
  # Filter to what is in the DEA results
  filter(substrate %in% human_res$MvF_SED$human_feature) %>%
  group_by(kinase) %>%
  summarise(substrate = list(substrate)) %>%
  deframe()
head(KS_sets)
#> $AAK1
#> [1] "Q96CW1_T156"
#> 
#> $ABL1
#> [1] "P49023_Y118" "P46109_Y207" "Q03135_Y14"  "Q14498_Y95"  "P16333_Y105"
#> [6] "P46108_Y251" "P52566_Y130"
#> 
#> $AKT1
#>  [1] "O60343_S318"  "Q2PPJ7_T715"  "P19634_S703"  "Q8TDN4_T415"  "P62136_T320" 
#>  [6] "P98177_S262"  "P08727_S35"   "Q9Y3M2_S20"   "P53396_S455"  "O60825_S466" 
#> [11] "Q07352_S92"   "Q14315_S2233" "P98177_T32"   "Q13131_S496"  "P04150_S134" 
#> [16] "P14618_S97"   "Q53EL6_S457"  "P26373_S106"  "Q96F86_S161"  "Q99623_S91"  
#> [21] "Q9UBP6_S27"   "P49815_S981"  "Q12778_S256"  "P53396_T447"  "O60343_S570" 
#> [26] "P46937_S127"  "P45985_S80"   "Q96QB1_S766"  "Q16875_S461"  "Q12778_T24"  
#> [31] "P26358_S143"  "Q92934_S99"   "O60825_S483"  "Q9BZQ8_S602"  "P09651_S199" 
#> [36] "P42858_S419"  "P49840_S21"   "P04792_S82"   "O15151_S367"  "Q7Z5H3_S16"  
#> [41] "P02545_S404"  "Q3V6T2_S1417" "Q13045_S436"  "P19634_S796"  "Q9UN36_T348" 
#> [46] "P29474_S1177" "P49815_S939"  "P49841_S9"    "Q9UN36_S332"  "Q9H6Z4_S126" 
#> [51] "Q96B36_T246"  "Q5XUX0_S33"   "O43524_T32"   "O60343_S341"  "P98177_S197" 
#> [56] "Q12778_S319"  "P15056_S365"  "P62753_S236"  "Q9P0V3_S246"  "O00429_S616" 
#> [61] "P54253_S775"  "Q9BRS8_S451"  "P14618_S202"  "P21333_S2152" "O60343_T642" 
#> [66] "P08670_S39"   "P20749_S41"   "O60343_S588"  "P46527_S10"   "Q96QB1_S1004"
#> [71] "Q9UNN5_S582"  "Q2PPJ7_S486"  "O60331_S555"  "P35222_S552"  "P02545_S301" 
#> [76] "Q9Y4P1_S34"   "Q8WX93_S1118" "Q9Y2I7_S307"  "Q7Z6J0_S304"  "O43524_S253" 
#> [81] "P05787_S432"  "Q7Z589_S209"  "Q15121_S116" 
#> 
#> $AKT2
#>  [1] "Q9Y3M2_S20"   "Q14315_S2233" "Q96F86_S161"  "Q04656_S1466" "Q99623_S91"  
#>  [6] "Q9Y4H2_S306"  "O60331_S555"  "P35222_S552"  "Q9Y4H2_S577"  "Q7Z6J0_S304" 
#> 
#> $AKT3
#> [1] "Q9UKV8_S387" "P15056_S365" "Q7Z589_S209"
#> 
#> $ALK
#> [1] "P40763_Y705" "P50552_Y39"  "Q06124_Y542"

# How many substrates are in PSP?
table(unique(human_res$MvF_SED$human_feature) %in% unlist(KS_sets))
#> 
#> FALSE  TRUE 
#> 18019  1118
# FALSE  TRUE
# 18019  1118

Only about 5.8% of the substrates in the phosphoproteomics data are in PhosphoSitePlus.

# How many will pass the size filter?
KS_sets <- KS_sets[lengths(KS_sets) >= 3]
length(KS_sets) # 121 kinases
#> [1] 121

# Removing small kinase sets drops some substrate sites
table(unique(human_res$MvF_SED$human_feature) %in% unlist(KS_sets))
#> 
#> FALSE  TRUE 
#> 18063  1074
# FALSE  TRUE
# 18063  1074

## Conversion vectors for KSEA leadingEdge
# Uniprot to gene symbol sites
human_uniprot_to_symbol <- PSP_KINASE_SUBSTRATE %>%
  transmute(across(c(SUB_ACC_ID, SUB_GENE),
                   ~ paste0(.x, "_", SUB_MOD_RSD))) %>%
  distinct() %>%
  deframe()
head(human_uniprot_to_symbol)
#>    P05198_S52    P05198_S49   Q9UQL6_S259 P18433-2_S204    P10415_S70 
#>  "EIF2S1_S52"  "EIF2S1_S49"  "HDAC5_S259"  "PTPRA_S204"    "BCL2_S70" 
#>   P61978_S302 
#> "HNRNPK_S302"

# Map human to rat sites (single-site level)
human_to_rat <- fData(PHOSPHO_EXP) %>%
  filter(!is.na(human_uniprot)) %>%
  separate_rows(site, human_site, sep = ";") %>%
  transmute(human = paste0(human_uniprot, "_", human_site),
            rat = paste0(gene_symbol, "_", site)) %>%
  distinct() %>%
  deframe()
head(human_to_rat)
#>     Q9UHL9_S448      P02545_S12     P02545_S143     P02545_S153      P02545_S17 
#> "Gtf2ird1_S448"      "Lmna_S12"     "Lmna_S143"     "Lmna_S153"      "Lmna_S17" 
#>      P02545_S18 
#>      "Lmna_S18"
# KSEA
PHOSPHO_KSEA <- map(human_res, function(res_i)
{
  rank_list <- rank_genes(res_i, genes = "human_feature")

  map(names(rank_list), function(contr_i) {
    set.seed(0)
    fgseaMultilevel(pathways = KS_sets,
                    stats = rank_list[[contr_i]],
                    minSize = 3,
                    nproc = 1, nPermSimple = 10000) %>%
      mutate(contrast = contr_i)
  }) %>%
    rbindlist() %>%
    mutate(padj = p.adjust(pval, method = "BH"),
           contrast = factor(contrast, levels = unique(contrast)),
           gs_subcat = "kinase",
           leadingEdge_symbol = map(.x = leadingEdge,
                                    .f = ~ human_uniprot_to_symbol[.x]),
           leadingEdge_rno = map(.x = leadingEdge,
                                 .f = ~ human_to_rat[.x])) %>%
    dplyr::rename(kinase = pathway) %>%
    relocate(contrast, .before = leadingEdge) %>%
    dplyr::select(-gs_subcat)
})
# Save
usethis::use_data(PHOSPHO_KSEA, internal = FALSE,
                  overwrite = TRUE, version = 3, compress = "bzip2")

Session Info

sessionInfo()
#> R version 4.4.0 (2024-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#>  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#>  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#> [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#> 
#> time zone: UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] fgsea_1.30.0                       data.table_1.15.4                 
#>  [3] stringr_1.5.1                      purrr_1.0.2                       
#>  [5] tibble_3.2.1                       tidyr_1.3.1                       
#>  [7] dplyr_1.1.4                        MotrpacRatTraining6moWAT_1.0.1    
#>  [9] Biobase_2.64.0                     BiocGenerics_0.50.0               
#> [11] MotrpacRatTraining6moWATData_2.0.0
#> 
#> loaded via a namespace (and not attached):
#>   [1] RColorBrewer_1.1-3      rstudioapi_0.16.0       jsonlite_1.8.8         
#>   [4] shape_1.4.6.1           magrittr_2.0.3          ggbeeswarm_0.7.2       
#>   [7] rmarkdown_2.26          GlobalOptions_0.1.2     fs_1.6.4               
#>  [10] zlibbioc_1.50.0         ragg_1.3.0              vctrs_0.6.5            
#>  [13] memoise_2.0.1           base64enc_0.1-3         rstatix_0.7.2          
#>  [16] htmltools_0.5.8.1       dynamicTreeCut_1.63-1   curl_5.2.1             
#>  [19] broom_1.0.5             Formula_1.2-5           sass_0.4.9             
#>  [22] bslib_0.7.0             htmlwidgets_1.6.4       desc_1.4.3             
#>  [25] impute_1.78.0           cachem_1.0.8            lifecycle_1.0.4        
#>  [28] iterators_1.0.14        pkgconfig_2.0.3         Matrix_1.7-0           
#>  [31] R6_2.5.1                fastmap_1.1.1           GenomeInfoDbData_1.2.12
#>  [34] clue_0.3-65             digest_0.6.35           colorspace_2.1-0       
#>  [37] patchwork_1.2.0         AnnotationDbi_1.65.2    S4Vectors_0.42.0       
#>  [40] textshaping_0.3.7       Hmisc_5.1-2             RSQLite_2.3.6          
#>  [43] ggpubr_0.6.0            filelock_1.0.3          latex2exp_0.9.6        
#>  [46] fansi_1.0.6             httr_1.4.7              abind_1.4-5            
#>  [49] compiler_4.4.0          withr_3.0.0             bit64_4.0.5            
#>  [52] doParallel_1.0.17       htmlTable_2.4.2         backports_1.4.1        
#>  [55] BiocParallel_1.38.0     carData_3.0-5           DBI_1.2.2              
#>  [58] ggsignif_0.6.4          rjson_0.2.21            tools_4.4.0            
#>  [61] vipor_0.4.7             foreign_0.8-86          beeswarm_0.4.0         
#>  [64] msigdbr_7.5.1           nnet_7.3-19             glue_1.7.0             
#>  [67] grid_4.4.0              checkmate_2.3.1         cluster_2.1.6          
#>  [70] generics_0.1.3          gtable_0.3.5            preprocessCore_1.66.0  
#>  [73] WGCNA_1.72-5            car_3.1-2               utf8_1.2.4             
#>  [76] XVector_0.44.0          foreach_1.5.2           pillar_1.9.0           
#>  [79] babelgene_22.9          limma_3.60.0            circlize_0.4.16        
#>  [82] splines_4.4.0           BiocFileCache_2.12.0    lattice_0.22-6         
#>  [85] survival_3.5-8          bit_4.0.5               tidyselect_1.2.1       
#>  [88] GO.db_3.19.1            ComplexHeatmap_2.20.0   locfit_1.5-9.9         
#>  [91] Biostrings_2.72.0       knitr_1.46              gridExtra_2.3          
#>  [94] IRanges_2.38.0          edgeR_4.2.0             stats4_4.4.0           
#>  [97] xfun_0.43               statmod_1.5.0           matrixStats_1.3.0      
#> [100] stringi_1.8.3           UCSC.utils_1.0.0        yaml_2.3.8             
#> [103] evaluate_0.23           codetools_0.2-20        cli_3.6.2              
#> [106] ontologyIndex_2.12      rpart_4.1.23            systemfonts_1.0.6      
#> [109] munsell_0.5.1           jquerylib_0.1.4         Rcpp_1.0.12            
#> [112] GenomeInfoDb_1.40.0     dbplyr_2.5.0            png_0.1-8              
#> [115] fastcluster_1.2.6       parallel_4.4.0          pkgdown_2.0.9          
#> [118] ggplot2_3.5.1           blob_1.2.4              scales_1.3.0           
#> [121] crayon_1.5.2            GetoptLong_1.0.5        rlang_1.1.3            
#> [124] cowplot_1.1.3           fastmatch_1.1-4         KEGGREST_1.44.0