vignettes/tidyverse_verse.Rmd
tidyverse_verse.Rmd
A `mass_dataset` object can be processed using most of the functions from the tidyverse (e.g. `dplyr`). Before using the tidyverse functions, we need to tell the object which of its components we want to process, using `activate_mass_dataset()`.
library(massdataset)
library(tidyverse)
# Load the demo tables used throughout this vignette.
data("expression_data")
data("sample_info")
data("sample_info_note")
data("variable_info")
data("variable_info_note")

# Assemble the five tables into a single mass_dataset object.
object <-
  create_mass_dataset(
    expression_data = expression_data,
    sample_info = sample_info,
    variable_info = variable_info,
    sample_info_note = sample_info_note,
    variable_info_note = variable_info_note
  )
# Activate sample_info and preview its columns.
object %>%
  activate_mass_dataset(what = "sample_info") %>%
  glimpse()
#> Rows: 8
#> Columns: 4
#> $ sample_id <chr> "Blank_3", "Blank_4", "QC_1", "QC_2", "PS4P1", "PS4P2"…
#> $ injection.order <dbl> 1, 2, 3, 4, 5, 6, 7, 8
#> $ class <chr> "Blank", "Blank", "QC", "QC", "Subject", "Subject", "S…
#> $ group <chr> "Blank", "Blank", "QC", "QC", "Subject", "Subject", "S…
# Activate expression_data and preview its columns (one per sample).
object %>%
  activate_mass_dataset(what = "expression_data") %>%
  glimpse()
#> Rows: 1,000
#> Columns: 8
#> $ Blank_3 <dbl> NA, NA, NA, NA, NA, 3761892.6, NA, NA, NA, 249352.6, NA, NA, N…
#> $ Blank_4 <dbl> NA, NA, NA, NA, NA, 2572593.4, NA, NA, NA, 131374.5, NA, NA, N…
#> $ QC_1 <dbl> 1857924.8, 2821550.2, 410387.6, NA, 8730104.8, NA, 3688690.6, …
#> $ QC_2 <dbl> 1037763.8, 1304875.3, 273687.8, NA, 4105598.5, 3662819.1, 2892…
#> $ PS4P1 <dbl> 1494436.1, 2471336.1, 288590.2, NA, 5141073.2, 5700534.8, 1401…
#> $ PS4P2 <dbl> 3496912.1, 3333582.7, 137297.5, 5059068.1, 8424315.6, 4600172.…
#> $ PS4P3 <dbl> 1959178.81, 2734243.82, NA, 5147421.59, 7896633.30, 5557014.59…
#> $ PS4P4 <dbl> 1005418.77, 3361452.28, 271318.30, NA, 6441448.99, 4433034.18,…
# Activate variable_info and preview its columns.
object %>%
  activate_mass_dataset(what = "variable_info") %>%
  glimpse()
#> Rows: 1,000
#> Columns: 3
#> $ variable_id <chr> "M136T55_2_POS", "M79T35_POS", "M307T548_POS", "M183T224_P…
#> $ mz <dbl> 136.06140, 79.05394, 307.14035, 183.06209, 349.01584, 181.…
#> $ rt <dbl> 54.97902, 35.36550, 547.56641, 224.32777, 47.00262, 828.35…
# Activate annotation_table and preview it (empty for this demo object).
object %>%
  activate_mass_dataset(what = "annotation_table") %>%
  glimpse()
#> Rows: 0
#> Columns: 0
# The active component is stored in the `activated` slot of the object.
object <-
  object %>%
  activate_mass_dataset(what = "sample_info")
object@activated
#> [1] "sample_info"

object <-
  object %>%
  activate_mass_dataset(what = "variable_info")
object@activated
#> [1] "variable_info"

object <-
  object %>%
  activate_mass_dataset(what = "expression_data")
object@activated
#> [1] "expression_data"
Keep only the QC samples.
# Filter on sample_info: keep the samples whose class is "QC".
object2 <-
  object %>%
  activate_mass_dataset(what = "sample_info") %>%
  dplyr::filter(class == "QC")

object2
#> --------------------
#> massdataset version: 0.99.14
#> --------------------
#> 1.expression_data:[ 1000 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-03-05 14:43:25
#> filter ----------
#> Package Function.used Time
#> 1 massdataset filter() 2022-03-11 19:08:37
Keep only the variables that are not NA in QC_1.
# Filter on expression_data: keep the variables with a value in sample QC_1.
object2 <-
  object %>%
  activate_mass_dataset(what = "expression_data") %>%
  dplyr::filter(!is.na(QC_1))

object2
#> --------------------
#> massdataset version: 0.99.14
#> --------------------
#> 1.expression_data:[ 603 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 603 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-03-05 14:43:25
#> filter ----------
#> Package Function.used Time
#> 1 massdataset filter() 2022-03-11 19:08:37
Keep only the variables with RSD > 50 in the subject samples.
# IDs of the samples whose class is "Subject".
subject_id <-
  object %>%
  activate_mass_dataset(what = "sample_info") %>%
  filter(class == "Subject") %>%
  pull(sample_id)

# mutate_rsd() is given the subject sample IDs; the resulting `rsd` column
# in variable_info is then used to keep the variables with rsd > 50.
object2 <-
  object %>%
  mutate_rsd(according_to_samples = subject_id) %>%
  activate_mass_dataset(what = "variable_info") %>%
  dplyr::filter(rsd > 50)

object2
#> --------------------
#> massdataset version: 0.99.14
#> --------------------
#> 1.expression_data:[ 142 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 142 x 4 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 4 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-03-05 14:43:25
#> mutate_rsd ----------
#> Package Function.used Time
#> 1 massdataset mutate_rsd() 2022-03-11 19:08:37
#> filter ----------
#> Package Function.used Time
#> 1 massdataset filter() 2022-03-11 19:08:37
# Show the first rows of variable_info, including the new rsd column.
object2 %>%
  extract_variable_info() %>%
  head()
#> variable_id mz rt rsd
#> M136T55_2_POS M136T55_2_POS 136.06140 54.97902 54.20205
#> M299T359_POS M299T359_POS 299.13754 359.47934 51.60401
#> M270T507_POS M270T507_POS 270.14035 506.53494 60.73094
#> M93T311_POS M93T311_POS 93.05433 311.45825 54.20045
#> M267T242_POS M267T242_POS 267.13025 241.56941 64.92011
#> M291T246_POS M291T246_POS 291.01947 246.10054 56.09987
### Add a new column to sample_info
# Copy sample_id into a new sample_info column named sample_id2.
object2 <-
  object %>%
  activate_mass_dataset(what = "sample_info") %>%
  mutate(sample_id2 = sample_id)

extract_sample_info(object2)
#> sample_id injection.order class group sample_id2
#> 1 Blank_3 1 Blank Blank Blank_3
#> 2 Blank_4 2 Blank Blank Blank_4
#> 3 QC_1 3 QC QC QC_1
#> 4 QC_2 4 QC QC QC_2
#> 5 PS4P1 5 Subject Subject PS4P1
#> 6 PS4P2 6 Subject Subject PS4P2
#> 7 PS4P3 7 Subject Subject PS4P3
#> 8 PS4P4 8 Subject Subject PS4P4
### Add a new column to variable_info
# Copy variable_id into a new variable_info column named variable_id2.
object2 <-
  object %>%
  activate_mass_dataset(what = "variable_info") %>%
  mutate(variable_id2 = variable_id)

object2 %>%
  extract_variable_info() %>%
  head()
#> variable_id mz rt variable_id2
#> 1 M136T55_2_POS 136.06140 54.97902 M136T55_2_POS
#> 2 M79T35_POS 79.05394 35.36550 M79T35_POS
#> 3 M307T548_POS 307.14035 547.56641 M307T548_POS
#> 4 M183T224_POS 183.06209 224.32777 M183T224_POS
#> 5 M349T47_POS 349.01584 47.00262 M349T47_POS
#> 6 M182T828_POS 181.99775 828.35712 M182T828_POS
### Add a new sample to expression_data
# Add a column named new_sample to expression_data, copied from QC_2.
object2 <-
  object %>%
  activate_mass_dataset(what = "expression_data") %>%
  mutate(new_sample = QC_2)

# NOTE(review): only the first five samples are selected here, so the
# new_sample column itself does not appear in this preview.
object2[, 1:5] %>%
  extract_expression_data() %>%
  head()
#> Blank_3 Blank_4 QC_1 QC_2 PS4P1
#> M136T55_2_POS NA NA 1857924.8 1037763.8 1494436.1
#> M79T35_POS NA NA 2821550.2 1304875.3 2471336.1
#> M307T548_POS NA NA 410387.6 273687.8 288590.2
#> M183T224_POS NA NA NA NA NA
#> M349T47_POS NA NA 8730104.8 4105598.5 5141073.2
#> M182T828_POS 3761893 2572593 NA 3662819.1 5700534.8
We can also use `add_column()` from the tibble package to add a new column.
# object %>%
# activate_mass_dataset(what = "expression_data") %>%
# add_column(x = NA)
# Extra per-sample metadata, available for two of the subject samples only.
new_sample_info <-
  data.frame(
    sample_id = c("PS4P1", "PS4P2"),
    BMI = c(20, 22)
  )

# Join the extra metadata onto sample_info by sample_id.
object2 <-
  object %>%
  activate_mass_dataset(what = "sample_info") %>%
  left_join(new_sample_info, by = "sample_id")

object2 %>%
  extract_sample_info() %>%
  head()
#> sample_id injection.order class group BMI
#> 1 Blank_3 1 Blank Blank NA
#> 2 Blank_4 2 Blank Blank NA
#> 3 QC_1 3 QC QC NA
#> 4 QC_2 4 QC QC NA
#> 5 PS4P1 5 Subject Subject 20
#> 6 PS4P2 6 Subject Subject 22
# Extract the group column of sample_info as a plain vector.
object %>%
  activate_mass_dataset(what = "sample_info") %>%
  pull(group)
#> [1] "Blank" "Blank" "QC" "QC" "Subject" "Subject" "Subject"
#> [8] "Subject"
# Rename sample Blank_3 to Blank_10 in expression_data.
object2 <-
  object %>%
  activate_mass_dataset(what = "expression_data") %>%
  rename(Blank_10 = Blank_3)

colnames(object2)
#> [1] "Blank_10" "Blank_4" "QC_1" "QC_2" "PS4P1" "PS4P2" "PS4P3"
#> [8] "PS4P4"

# Check the sample_id column of sample_info after the rename.
extract_sample_info(object2)
#> sample_id injection.order class group
#> 1 Blank_10 1 Blank Blank
#> 2 Blank_4 2 Blank Blank
#> 3 QC_1 3 QC QC
#> 4 QC_2 4 QC QC
#> 5 PS4P1 5 Subject Subject
#> 6 PS4P2 6 Subject Subject
#> 7 PS4P3 7 Subject Subject
#> 8 PS4P4 8 Subject Subject
# Sort the samples by class in descending order.
object2 <-
  object %>%
  activate_mass_dataset(what = "sample_info") %>%
  arrange(desc(class))

# Original order.
extract_sample_info(object)
#> sample_id injection.order class group
#> 1 Blank_3 1 Blank Blank
#> 2 Blank_4 2 Blank Blank
#> 3 QC_1 3 QC QC
#> 4 QC_2 4 QC QC
#> 5 PS4P1 5 Subject Subject
#> 6 PS4P2 6 Subject Subject
#> 7 PS4P3 7 Subject Subject
#> 8 PS4P4 8 Subject Subject

# Order after arrange().
extract_sample_info(object2)
#> sample_id injection.order class group
#> 1 PS4P1 5 Subject Subject
#> 2 PS4P2 6 Subject Subject
#> 3 PS4P3 7 Subject Subject
#> 4 PS4P4 8 Subject Subject
#> 5 QC_1 3 QC QC
#> 6 QC_2 4 QC QC
#> 7 Blank_3 1 Blank Blank
#> 8 Blank_4 2 Blank Blank
# Move sample QC_1 to the first position in expression_data.
object2 <-
  object %>%
  activate_mass_dataset(what = "expression_data") %>%
  relocate(QC_1, everything())

# Sample order before relocate().
colnames(object)
#> [1] "Blank_3" "Blank_4" "QC_1" "QC_2" "PS4P1" "PS4P2" "PS4P3"
#> [8] "PS4P4"

# Sample order after relocate().
colnames(object2)
#> [1] "QC_1" "Blank_3" "Blank_4" "QC_2" "PS4P1" "PS4P2" "PS4P3"
#> [8] "PS4P4"
# Keep the first ten variables (rows of expression_data).
object %>%
  activate_mass_dataset(what = "expression_data") %>%
  slice(1:10)
#> --------------------
#> massdataset version: 0.99.14
#> --------------------
#> 1.expression_data:[ 10 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 10 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-03-05 14:43:25
#> slice ----------
#> Package Function.used Time
#> 1 massdataset slice() 2022-03-11 20:23:24
# Sort the variables by their QC_2 value, then keep the first three.
object %>%
  activate_mass_dataset(what = "expression_data") %>%
  dplyr::arrange(QC_2) %>%
  slice_head(n = 3)
#> --------------------
#> massdataset version: 0.99.14
#> --------------------
#> 1.expression_data:[ 3 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 3 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-03-05 14:43:25
#> slice_head ----------
#> Package Function.used Time
#> 1 massdataset slice_head() 2022-03-11 20:23:24
# Drop the single sample Blank_3 from expression_data.
object2 <-
  object %>%
  activate_mass_dataset(what = "expression_data") %>%
  select(-Blank_3)

colnames(object2)
#> [1] "Blank_4" "QC_1" "QC_2" "PS4P1" "PS4P2" "PS4P3" "PS4P4"
# Drop every sample whose name contains "Blank".
object2 <-
  object %>%
  activate_mass_dataset(what = "expression_data") %>%
  select(-contains("Blank"))

colnames(object2)
#> [1] "QC_1" "QC_2" "PS4P1" "PS4P2" "PS4P3" "PS4P4"
# Count the number of samples in each class.
object %>%
  activate_mass_dataset(what = "sample_info") %>%
  count(class)
#> class n
#> 1 Blank 2
#> 2 QC 2
#> 3 Subject 4
`drop_na()` from the tidyr package.
`pivot_longer()` from the tidyr package.
# Reshape the dataset to long format (one row per variable/sample pair)
# and show the first rows.
object %>%
  pivot_longer() %>%
  head()
#> variable_id sample_id value
#> 1 M136T55_2_POS Blank_3 NA
#> 2 M136T55_2_POS Blank_4 NA
#> 3 M136T55_2_POS QC_1 1857925
#> 4 M136T55_2_POS QC_2 1037764
#> 5 M136T55_2_POS PS4P1 1494436
#> 6 M136T55_2_POS PS4P2 3496912
# Print the R session details (R version, platform, attached packages).
sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] forcats_0.5.1.9000 stringr_1.4.0 dplyr_1.0.8
#> [4] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0
#> [7] tibble_3.1.6 tidyverse_1.3.1 tinytools_0.9.1
#> [10] ggplot2_3.3.5 magrittr_2.0.2 masstools_0.99.5
#> [13] massdataset_0.99.22
#>
#> loaded via a namespace (and not attached):
#> [1] colorspace_2.0-2 rjson_0.2.21 ellipsis_0.3.2
#> [4] leaflet_2.1.0 rprojroot_2.0.2 circlize_0.4.14
#> [7] GlobalOptions_0.1.2 fs_1.5.2 clue_0.3-60
#> [10] rstudioapi_0.13 mzR_2.28.0 affyio_1.64.0
#> [13] lubridate_1.8.0 fansi_1.0.2 xml2_1.3.3
#> [16] codetools_0.2-18 ncdf4_1.19 doParallel_1.0.17
#> [19] cachem_1.0.6 impute_1.68.0 knitr_1.37
#> [22] jsonlite_1.7.3 broom_0.7.12 dbplyr_2.1.1
#> [25] cluster_2.1.2 vsn_3.62.0 png_0.1-7
#> [28] BiocManager_1.30.16 compiler_4.1.2 httr_1.4.2
#> [31] backports_1.4.1 assertthat_0.2.1 fastmap_1.1.0
#> [34] lazyeval_0.2.2 limma_3.50.0 cli_3.2.0
#> [37] htmltools_0.5.2 tools_4.1.2 gtable_0.3.0
#> [40] glue_1.6.1 affy_1.72.0 Rcpp_1.0.8
#> [43] MALDIquant_1.21 Biobase_2.54.0 cellranger_1.1.0
#> [46] jquerylib_0.1.4 pkgdown_2.0.2 vctrs_0.3.8
#> [49] preprocessCore_1.56.0 iterators_1.0.14 crosstalk_1.2.0
#> [52] xfun_0.29 rvest_1.0.2 openxlsx_4.2.5
#> [55] lifecycle_1.0.1 XML_3.99-0.8 MASS_7.3-55
#> [58] zlibbioc_1.40.0 scales_1.1.1 MSnbase_2.20.4
#> [61] ragg_1.2.1 pcaMethods_1.86.0 hms_1.1.1
#> [64] ProtGenerics_1.26.0 parallel_4.1.2 RColorBrewer_1.1-2
#> [67] ComplexHeatmap_2.10.0 yaml_2.3.4 memoise_2.0.1
#> [70] pbapply_1.5-0 yulab.utils_0.0.4 sass_0.4.0
#> [73] stringi_1.7.6 S4Vectors_0.32.3 desc_1.4.0
#> [76] foreach_1.5.2 BiocGenerics_0.40.0 zip_2.2.0
#> [79] BiocParallel_1.28.3 shape_1.4.6 rlang_1.0.1
#> [82] pkgconfig_2.0.3 systemfonts_1.0.3 matrixStats_0.61.0
#> [85] mzID_1.32.0 evaluate_0.15 lattice_0.20-45
#> [88] htmlwidgets_1.5.4 tidyselect_1.1.1 ggsci_2.9
#> [91] plyr_1.8.6 R6_2.5.1 IRanges_2.28.0
#> [94] generics_0.1.2 DBI_1.1.2 haven_2.4.3
#> [97] pillar_1.7.0 withr_2.4.3 MsCoreUtils_1.6.0
#> [100] modelr_0.1.8 crayon_1.5.0 utf8_1.2.2
#> [103] plotly_4.10.0 tzdb_0.2.0 rmarkdown_2.11
#> [106] GetoptLong_1.0.5 grid_4.1.2 readxl_1.3.1
#> [109] data.table_1.14.2 reprex_2.0.1 digest_0.6.29
#> [112] gridGraphics_0.5-1 textshaping_0.3.6 stats4_4.1.2
#> [115] munsell_0.5.0 viridisLite_0.4.0 ggplotify_0.1.0
#> [118] bslib_0.3.1