Introduction

A mass_dataset object can be processed with most of the functions from the tidyverse (dplyr). Before using a tidyverse function, we need to tell it which part of the dataset we want to process, using activate_mass_dataset().

Data preparation

library(massdataset)
library(tidyverse)

# Load the example tables used throughout this document.
data(
  list = c(
    "expression_data",
    "sample_info",
    "sample_info_note",
    "variable_info",
    "variable_info_note"
  )
)

# Combine the five tables into a single mass_dataset object.
object <- create_mass_dataset(
  expression_data = expression_data,
  sample_info = sample_info,
  variable_info = variable_info,
  sample_info_note = sample_info_note,
  variable_info_note = variable_info_note
)

glimpse() function

# Glimpse the sample_info table of the dataset.
glimpse(activate_mass_dataset(object, what = "sample_info"))
#> Rows: 8
#> Columns: 4
#> $ sample_id       <chr> "Blank_3", "Blank_4", "QC_1", "QC_2", "PS4P1", "PS4P2"…
#> $ injection.order <dbl> 1, 2, 3, 4, 5, 6, 7, 8
#> $ class           <chr> "Blank", "Blank", "QC", "QC", "Subject", "Subject", "S…
#> $ group           <chr> "Blank", "Blank", "QC", "QC", "Subject", "Subject", "S…
# Glimpse the expression_data table (one column per sample).
glimpse(activate_mass_dataset(object, what = "expression_data"))
#> Rows: 1,000
#> Columns: 8
#> $ Blank_3 <dbl> NA, NA, NA, NA, NA, 3761892.6, NA, NA, NA, 249352.6, NA, NA, N…
#> $ Blank_4 <dbl> NA, NA, NA, NA, NA, 2572593.4, NA, NA, NA, 131374.5, NA, NA, N…
#> $ QC_1    <dbl> 1857924.8, 2821550.2, 410387.6, NA, 8730104.8, NA, 3688690.6, …
#> $ QC_2    <dbl> 1037763.8, 1304875.3, 273687.8, NA, 4105598.5, 3662819.1, 2892…
#> $ PS4P1   <dbl> 1494436.1, 2471336.1, 288590.2, NA, 5141073.2, 5700534.8, 1401…
#> $ PS4P2   <dbl> 3496912.1, 3333582.7, 137297.5, 5059068.1, 8424315.6, 4600172.…
#> $ PS4P3   <dbl> 1959178.81, 2734243.82, NA, 5147421.59, 7896633.30, 5557014.59…
#> $ PS4P4   <dbl> 1005418.77, 3361452.28, 271318.30, NA, 6441448.99, 4433034.18,…
# Glimpse the variable_info table.
glimpse(activate_mass_dataset(object, what = "variable_info"))
#> Rows: 1,000
#> Columns: 3
#> $ variable_id <chr> "M136T55_2_POS", "M79T35_POS", "M307T548_POS", "M183T224_P…
#> $ mz          <dbl> 136.06140, 79.05394, 307.14035, 183.06209, 349.01584, 181.…
#> $ rt          <dbl> 54.97902, 35.36550, 547.56641, 224.32777, 47.00262, 828.35…
# Glimpse the annotation_table (empty for this example object).
glimpse(activate_mass_dataset(object, what = "annotation_table"))
#> Rows: 0
#> Columns: 0

filter() function

# Activate each table in turn; the current choice is stored in the
# `activated` slot of the object.
object <- activate_mass_dataset(object, what = "sample_info")
object@activated
#> [1] "sample_info"
object <- activate_mass_dataset(object, what = "variable_info")
object@activated
#> [1] "variable_info"
object <- activate_mass_dataset(object, what = "expression_data")
object@activated
#> [1] "expression_data"

Filter samples

Keep only the QC samples.

# Keep only the samples whose class is "QC".
object2 <- dplyr::filter(
  activate_mass_dataset(object, what = "sample_info"),
  class == "QC"
)
object2
#> -------------------- 
#> massdataset version: 0.99.14 
#> -------------------- 
#> 1.expression_data:[ 1000 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-03-05 14:43:25
#> filter ---------- 
#>       Package Function.used                Time
#> 1 massdataset      filter() 2022-03-11 19:08:37

Filter variables

Keep only the variables that are not NA in QC_1.

# Keep only the variables (rows) with a non-missing value in sample QC_1.
object2 <- dplyr::filter(
  activate_mass_dataset(object, what = "expression_data"),
  !is.na(QC_1)
)
object2
#> -------------------- 
#> massdataset version: 0.99.14 
#> -------------------- 
#> 1.expression_data:[ 603 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 603 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-03-05 14:43:25
#> filter ---------- 
#>       Package Function.used                Time
#> 1 massdataset      filter() 2022-03-11 19:08:37

Keep only the variables with RSD > 50 in the subject samples.

# Collect the IDs of all samples whose class is "Subject".
subject_id <- object %>%
  activate_mass_dataset(what = "sample_info") %>%
  filter(class == "Subject") %>%
  pull(sample_id)

# Add an `rsd` column to variable_info, computed from the subject samples,
# then keep the variables whose rsd exceeds 50.
object2 <- object %>%
  mutate_rsd(according_to_samples = subject_id) %>%
  activate_mass_dataset(what = "variable_info") %>%
  dplyr::filter(rsd > 50)
object2
#> -------------------- 
#> massdataset version: 0.99.14 
#> -------------------- 
#> 1.expression_data:[ 142 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 142 x 4 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 4 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-03-05 14:43:25
#> mutate_rsd ---------- 
#>       Package Function.used                Time
#> 1 massdataset  mutate_rsd() 2022-03-11 19:08:37
#> filter ---------- 
#>       Package Function.used                Time
#> 1 massdataset      filter() 2022-03-11 19:08:37

# Show the first rows of variable_info, including the new rsd column.
head(extract_variable_info(object2))
#>                 variable_id        mz        rt      rsd
#> M136T55_2_POS M136T55_2_POS 136.06140  54.97902 54.20205
#> M299T359_POS   M299T359_POS 299.13754 359.47934 51.60401
#> M270T507_POS   M270T507_POS 270.14035 506.53494 60.73094
#> M93T311_POS     M93T311_POS  93.05433 311.45825 54.20045
#> M267T242_POS   M267T242_POS 267.13025 241.56941 64.92011
#> M291T246_POS   M291T246_POS 291.01947 246.10054 56.09987

mutate() function

# Add a new column to sample_info (here a copy of sample_id).
object2 <- mutate(
  activate_mass_dataset(object, what = "sample_info"),
  sample_id2 = sample_id
)

extract_sample_info(object2)
#>   sample_id injection.order   class   group sample_id2
#> 1   Blank_3               1   Blank   Blank    Blank_3
#> 2   Blank_4               2   Blank   Blank    Blank_4
#> 3      QC_1               3      QC      QC       QC_1
#> 4      QC_2               4      QC      QC       QC_2
#> 5     PS4P1               5 Subject Subject      PS4P1
#> 6     PS4P2               6 Subject Subject      PS4P2
#> 7     PS4P3               7 Subject Subject      PS4P3
#> 8     PS4P4               8 Subject Subject      PS4P4

# Add a new column to variable_info (here a copy of variable_id).
object2 <- mutate(
  activate_mass_dataset(object, what = "variable_info"),
  variable_id2 = variable_id
)
head(extract_variable_info(object2))
#>     variable_id        mz        rt  variable_id2
#> 1 M136T55_2_POS 136.06140  54.97902 M136T55_2_POS
#> 2    M79T35_POS  79.05394  35.36550    M79T35_POS
#> 3  M307T548_POS 307.14035 547.56641  M307T548_POS
#> 4  M183T224_POS 183.06209 224.32777  M183T224_POS
#> 5   M349T47_POS 349.01584  47.00262   M349T47_POS
#> 6  M182T828_POS 181.99775 828.35712  M182T828_POS

# Add a new sample column to expression_data (here a copy of QC_2).
object2 <- mutate(
  activate_mass_dataset(object, what = "expression_data"),
  new_sample = QC_2
)
# Preview the first five sample columns only.
head(extract_expression_data(object2[, 1:5]))
#>               Blank_3 Blank_4      QC_1      QC_2     PS4P1
#> M136T55_2_POS      NA      NA 1857924.8 1037763.8 1494436.1
#> M79T35_POS         NA      NA 2821550.2 1304875.3 2471336.1
#> M307T548_POS       NA      NA  410387.6  273687.8  288590.2
#> M183T224_POS       NA      NA        NA        NA        NA
#> M349T47_POS        NA      NA 8730104.8 4105598.5 5141073.2
#> M182T828_POS  3761893 2572593        NA 3662819.1 5700534.8

add_column() function

We can also use add_column() from the tibble package to add a new column.

# object %>%
#   activate_mass_dataset(what = "expression_data") %>% 
#   add_column(x = NA)

left_join() function

# Extra per-sample annotation, keyed by sample_id.
new_sample_info <- data.frame(
  sample_id = c("PS4P1", "PS4P2"),
  BMI = c(20, 22)
)

# Join the annotation onto sample_info; samples without a match get NA.
object2 <- left_join(
  activate_mass_dataset(object, what = "sample_info"),
  new_sample_info,
  by = "sample_id"
)

head(extract_sample_info(object2))
#>   sample_id injection.order   class   group BMI
#> 1   Blank_3               1   Blank   Blank  NA
#> 2   Blank_4               2   Blank   Blank  NA
#> 3      QC_1               3      QC      QC  NA
#> 4      QC_2               4      QC      QC  NA
#> 5     PS4P1               5 Subject Subject  20
#> 6     PS4P2               6 Subject Subject  22

pull() function

# Extract the `group` column of sample_info as a plain vector.
pull(activate_mass_dataset(object, what = "sample_info"), group)
#> [1] "Blank"   "Blank"   "QC"      "QC"      "Subject" "Subject" "Subject"
#> [8] "Subject"

rename() function

# Rename the sample column Blank_3 to Blank_10.
object2 <- rename(
  activate_mass_dataset(object, what = "expression_data"),
  Blank_10 = Blank_3
)
colnames(object2)
#> [1] "Blank_10" "Blank_4"  "QC_1"     "QC_2"     "PS4P1"    "PS4P2"    "PS4P3"   
#> [8] "PS4P4"
# The new name is reflected in sample_info as well.
extract_sample_info(object2)
#>   sample_id injection.order   class   group
#> 1  Blank_10               1   Blank   Blank
#> 2   Blank_4               2   Blank   Blank
#> 3      QC_1               3      QC      QC
#> 4      QC_2               4      QC      QC
#> 5     PS4P1               5 Subject Subject
#> 6     PS4P2               6 Subject Subject
#> 7     PS4P3               7 Subject Subject
#> 8     PS4P4               8 Subject Subject

arrange() function

# Order the samples by class, descending; the result goes to object2,
# so `object` itself keeps its original order.
object2 <- arrange(
  activate_mass_dataset(object, what = "sample_info"),
  desc(class)
)
extract_sample_info(object)
#>   sample_id injection.order   class   group
#> 1   Blank_3               1   Blank   Blank
#> 2   Blank_4               2   Blank   Blank
#> 3      QC_1               3      QC      QC
#> 4      QC_2               4      QC      QC
#> 5     PS4P1               5 Subject Subject
#> 6     PS4P2               6 Subject Subject
#> 7     PS4P3               7 Subject Subject
#> 8     PS4P4               8 Subject Subject
extract_sample_info(object2)
#>   sample_id injection.order   class   group
#> 1     PS4P1               5 Subject Subject
#> 2     PS4P2               6 Subject Subject
#> 3     PS4P3               7 Subject Subject
#> 4     PS4P4               8 Subject Subject
#> 5      QC_1               3      QC      QC
#> 6      QC_2               4      QC      QC
#> 7   Blank_3               1   Blank   Blank
#> 8   Blank_4               2   Blank   Blank

relocate() function

# Move the QC_1 sample column to the front, keeping the others in order.
object2 <- relocate(
  activate_mass_dataset(object, what = "expression_data"),
  QC_1,
  everything()
)
colnames(object)
#> [1] "Blank_3" "Blank_4" "QC_1"    "QC_2"    "PS4P1"   "PS4P2"   "PS4P3"  
#> [8] "PS4P4"
colnames(object2)
#> [1] "QC_1"    "Blank_3" "Blank_4" "QC_2"    "PS4P1"   "PS4P2"   "PS4P3"  
#> [8] "PS4P4"

slice() function

# Keep the first ten variables (rows of expression_data).
slice(activate_mass_dataset(object, what = "expression_data"), 1:10)
#> -------------------- 
#> massdataset version: 0.99.14 
#> -------------------- 
#> 1.expression_data:[ 10 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 10 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-03-05 14:43:25
#> slice ---------- 
#>       Package Function.used                Time
#> 1 massdataset       slice() 2022-03-11 20:23:24

# Sort the variables by their QC_2 value, then keep the first three.
slice_head(
  dplyr::arrange(
    activate_mass_dataset(object, what = "expression_data"),
    QC_2
  ),
  n = 3
)
#> -------------------- 
#> massdataset version: 0.99.14 
#> -------------------- 
#> 1.expression_data:[ 3 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 3 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-03-05 14:43:25
#> slice_head ---------- 
#>       Package Function.used                Time
#> 1 massdataset  slice_head() 2022-03-11 20:23:24

select() function

# Drop a single sample column by name.
object2 <- select(
  activate_mass_dataset(object, what = "expression_data"),
  -Blank_3
)

colnames(object2)
#> [1] "Blank_4" "QC_1"    "QC_2"    "PS4P1"   "PS4P2"   "PS4P3"   "PS4P4"

# Drop every sample column whose name contains "Blank".
object2 <- select(
  activate_mass_dataset(object, what = "expression_data"),
  -contains("Blank")
)
colnames(object2)
#> [1] "QC_1"  "QC_2"  "PS4P1" "PS4P2" "PS4P3" "PS4P4"

group_by() and summarize() function

# Count the samples in each class via group_by() + summarise().
summarise(
  group_by(
    activate_mass_dataset(object, what = "sample_info"),
    class
  ),
  n = n()
)
#> # A tibble: 3 × 2
#>   class       n
#>   <chr>   <int>
#> 1 Blank       2
#> 2 QC          2
#> 3 Subject     4

count() function

# Count the samples in each class with count().
count(activate_mass_dataset(object, what = "sample_info"), class)
#>     class n
#> 1   Blank 2
#> 2      QC 2
#> 3 Subject 4

drop_na() function

drop_na() from tidyr package.

# Remove the variables that have a missing value in sample QC_2.
object2 <- drop_na(
  activate_mass_dataset(object, what = "expression_data"),
  QC_2
)
head(object$QC_2)
#> [1] 1037763.8 1304875.3  273687.8        NA 4105598.5 3662819.1
head(object2$QC_2)
#> [1] 1037763.8 1304875.3  273687.8 4105598.5 3662819.1 2892719.6

pivot_longer() function

pivot_longer() from tidyr package.

# Reshape the dataset to long format: one row per variable/sample pair.
object %>%
  pivot_longer() %>%
  head()
#>     variable_id sample_id   value
#> 1 M136T55_2_POS   Blank_3      NA
#> 2 M136T55_2_POS   Blank_4      NA
#> 3 M136T55_2_POS      QC_1 1857925
#> 4 M136T55_2_POS      QC_2 1037764
#> 5 M136T55_2_POS     PS4P1 1494436
#> 6 M136T55_2_POS     PS4P2 3496912

Session information

sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur 10.16
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] forcats_0.5.1.9000  stringr_1.4.0       dplyr_1.0.8        
#>  [4] purrr_0.3.4         readr_2.1.2         tidyr_1.2.0        
#>  [7] tibble_3.1.6        tidyverse_1.3.1     tinytools_0.9.1    
#> [10] ggplot2_3.3.5       magrittr_2.0.2      masstools_0.99.5   
#> [13] massdataset_0.99.22
#> 
#> loaded via a namespace (and not attached):
#>   [1] colorspace_2.0-2      rjson_0.2.21          ellipsis_0.3.2       
#>   [4] leaflet_2.1.0         rprojroot_2.0.2       circlize_0.4.14      
#>   [7] GlobalOptions_0.1.2   fs_1.5.2              clue_0.3-60          
#>  [10] rstudioapi_0.13       mzR_2.28.0            affyio_1.64.0        
#>  [13] lubridate_1.8.0       fansi_1.0.2           xml2_1.3.3           
#>  [16] codetools_0.2-18      ncdf4_1.19            doParallel_1.0.17    
#>  [19] cachem_1.0.6          impute_1.68.0         knitr_1.37           
#>  [22] jsonlite_1.7.3        broom_0.7.12          dbplyr_2.1.1         
#>  [25] cluster_2.1.2         vsn_3.62.0            png_0.1-7            
#>  [28] BiocManager_1.30.16   compiler_4.1.2        httr_1.4.2           
#>  [31] backports_1.4.1       assertthat_0.2.1      fastmap_1.1.0        
#>  [34] lazyeval_0.2.2        limma_3.50.0          cli_3.2.0            
#>  [37] htmltools_0.5.2       tools_4.1.2           gtable_0.3.0         
#>  [40] glue_1.6.1            affy_1.72.0           Rcpp_1.0.8           
#>  [43] MALDIquant_1.21       Biobase_2.54.0        cellranger_1.1.0     
#>  [46] jquerylib_0.1.4       pkgdown_2.0.2         vctrs_0.3.8          
#>  [49] preprocessCore_1.56.0 iterators_1.0.14      crosstalk_1.2.0      
#>  [52] xfun_0.29             rvest_1.0.2           openxlsx_4.2.5       
#>  [55] lifecycle_1.0.1       XML_3.99-0.8          MASS_7.3-55          
#>  [58] zlibbioc_1.40.0       scales_1.1.1          MSnbase_2.20.4       
#>  [61] ragg_1.2.1            pcaMethods_1.86.0     hms_1.1.1            
#>  [64] ProtGenerics_1.26.0   parallel_4.1.2        RColorBrewer_1.1-2   
#>  [67] ComplexHeatmap_2.10.0 yaml_2.3.4            memoise_2.0.1        
#>  [70] pbapply_1.5-0         yulab.utils_0.0.4     sass_0.4.0           
#>  [73] stringi_1.7.6         S4Vectors_0.32.3      desc_1.4.0           
#>  [76] foreach_1.5.2         BiocGenerics_0.40.0   zip_2.2.0            
#>  [79] BiocParallel_1.28.3   shape_1.4.6           rlang_1.0.1          
#>  [82] pkgconfig_2.0.3       systemfonts_1.0.3     matrixStats_0.61.0   
#>  [85] mzID_1.32.0           evaluate_0.15         lattice_0.20-45      
#>  [88] htmlwidgets_1.5.4     tidyselect_1.1.1      ggsci_2.9            
#>  [91] plyr_1.8.6            R6_2.5.1              IRanges_2.28.0       
#>  [94] generics_0.1.2        DBI_1.1.2             haven_2.4.3          
#>  [97] pillar_1.7.0          withr_2.4.3           MsCoreUtils_1.6.0    
#> [100] modelr_0.1.8          crayon_1.5.0          utf8_1.2.2           
#> [103] plotly_4.10.0         tzdb_0.2.0            rmarkdown_2.11       
#> [106] GetoptLong_1.0.5      grid_4.1.2            readxl_1.3.1         
#> [109] data.table_1.14.2     reprex_2.0.1          digest_0.6.29        
#> [112] gridGraphics_0.5-1    textshaping_0.3.6     stats4_4.1.2         
#> [115] munsell_0.5.0         viridisLite_0.4.0     ggplotify_0.1.0      
#> [118] bslib_0.3.1