Filter samples/variables based on the conditions
filter_samples(object, flist, prune = TRUE, apply_to = "all")
filter_variables(
object,
flist,
prune = TRUE,
apply_to = "all",
according_to_samples = "all"
)
(required) mass_dataset class object.
(required) A function or list of functions that take a vector of abundance values and return a logical.
(optional) A logical. Default TRUE. If TRUE, then
the function returns the pruned mass_dataset-class object, rather
than the logical vector of samples/variables that passed the filter.
(optional) Which samples (for filter_samples) or variables (for filter_variables) the filter is applied to. Default is "all". If you only want to apply it to specific samples/variables, provide their names as a vector. All other samples/variables will be set as TRUE.
(optional) Which samples are used to filter the variables. Default is "all". If you want to use only some of the samples, provide their names as a vector.
A logical vector whose length equals the number of samples/variables in the mass_dataset-class object.
Alternatively, if prune == TRUE, the pruned mass_dataset-class
object is returned instead.
data("expression_data")
data("sample_info")
data("variable_info")
object =
create_mass_dataset(
expression_data = expression_data,
sample_info = sample_info,
variable_info = variable_info,
)
filter_samples(object, function(x) {
sum(is.na(x)) / length(x) < 0.4
})
#> --------------------
#> massdataset version: 1.0.33
#> --------------------
#> 1.expression_data:[ 1000 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 2 samples:QC_1 QC_2
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 1000 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M232T937_POS M301T277_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2024-09-06 08:49:48
#> filter_samples ----------
#> Package Function.used Time
#> 1 massdataset filter_samples() 2024-09-06 08:49:48
filter_samples(object, function(x) {
sum(is.na(x)) / length(x) < 0.4
}, prune = FALSE)
#> Blank_3 Blank_4 QC_1 QC_2 PS4P1 PS4P2 PS4P3 PS4P4
#> FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## Only apply the filter to Subject samples
object2 =
filter_samples(
object = object,
flist = function(x) {
sum(is.na(x))/length(x) < 0.2
},
prune = TRUE,
apply_to = get_sample_id(object)[extract_sample_info(object)$class == "Subject"]
)
object2
#> --------------------
#> massdataset version: 1.0.33
#> --------------------
#> 1.expression_data:[ 1000 x 4 data.frame]
#> 2.sample_info:[ 4 x 4 data.frame]
#> 4 samples:Blank_3 Blank_4 QC_1 QC_2
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 1000 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M232T937_POS M301T277_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2024-09-06 08:49:48
#> filter_samples ----------
#> Package Function.used Time
#> 1 massdataset filter_samples() 2024-09-06 08:49:48
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ forcats 1.0.0 ✔ stringr 1.5.1
#> ✔ lubridate 1.9.3 ✔ tibble 3.2.1
#> ✔ purrr 1.0.2 ✔ tidyr 1.3.1
#> ✔ readr 2.1.5
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ tidyr::extract() masks magrittr::extract()
#> ✖ dplyr::filter() masks massdataset::filter(), stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ purrr::set_names() masks magrittr::set_names()
#> ℹ Use the conflicted package to force all conflicts to become errors
data("expression_data")
data("sample_info")
data("variable_info")
object =
create_mass_dataset(
expression_data = expression_data,
sample_info = sample_info,
variable_info = variable_info,
)
object
#> --------------------
#> massdataset version: 1.0.33
#> --------------------
#> 1.expression_data:[ 1000 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 8 samples:Blank_3 Blank_4 QC_1 ... PS4P3 PS4P4
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 1000 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M232T937_POS M301T277_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 1 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2024-09-06 08:49:49
#### Keep variables which have less than 50% missing values (MVs) across all samples.
library(tidyverse)
filter_variables(object, function(x) {
sum(is.na(x)) / length(x) < 0.5
}, prune = FALSE) %>%
head()
#> M136T55_2_POS M79T35_POS M307T548_POS M183T224_POS M349T47_POS
#> TRUE TRUE TRUE FALSE TRUE
#> M182T828_POS
#> TRUE
filter_variables(object, function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = TRUE)
#> --------------------
#> massdataset version: 1.0.33
#> --------------------
#> 1.expression_data:[ 422 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 8 samples:Blank_3 Blank_4 QC_1 ... PS4P3 PS4P4
#> 3.variable_info:[ 422 x 3 data.frame]
#> 422 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M236T543_POS M232T937_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2024-09-06 08:49:49
#> filter_variables ----------
#> Package Function.used Time
#> 1 massdataset filter_variables() 2024-09-06 08:49:49
#### Keep variables which have less than 50% missing values (MVs) in the QC samples only.
filter_variables(
object,
flist = function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = TRUE,
according_to_samples =
get_sample_id(object)[extract_sample_info(object)$class == "QC"]
)
#> --------------------
#> massdataset version: 1.0.33
#> --------------------
#> 1.expression_data:[ 496 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 8 samples:Blank_3 Blank_4 QC_1 ... PS4P3 PS4P4
#> 3.variable_info:[ 496 x 3 data.frame]
#> 496 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M361T681_POS M236T543_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2024-09-06 08:49:49
#> filter_variables ----------
#> Package Function.used Time
#> 1 massdataset filter_variables() 2024-09-06 08:49:49
#### Keep variables which have less than 50% missing values (MVs) in the QC samples or in the Subject samples.
idx1 =
filter_variables(
object,
flist = function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = FALSE,
according_to_samples =
get_sample_id(object)[extract_sample_info(object)$class == "QC"]
)
idx2 =
filter_variables(
object,
flist = function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = FALSE,
according_to_samples =
get_sample_id(object)[extract_sample_info(object)$class == "Subject"]
)
idx =
which(idx1 | idx2)
object2 = object[idx,]
object2
#> --------------------
#> massdataset version: 1.0.33
#> --------------------
#> 1.expression_data:[ 642 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 8 samples:Blank_3 Blank_4 QC_1 ... PS4P3 PS4P4
#> 3.variable_info:[ 642 x 3 data.frame]
#> 642 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M232T937_POS M301T277_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2024-09-06 08:49:49
#> subset ----------
#> Package Function.used Time
#> 1 massdataset [ 2024-09-06 08:49:49
#### Keep variables with RSD (in QC samples) < 30%
object3 =
filter_variables(
object = object,
flist = function(x) {
rsd = sd(x) * 100 / mean(x)
rsd = ifelse(is.na(rsd), 100, rsd)
rsd < 30
},
apply_to = "all",
prune = TRUE,
according_to_samples = get_sample_id(object)[extract_sample_info(object)$class == "QC"]
)
object3
#> --------------------
#> massdataset version: 1.0.33
#> --------------------
#> 1.expression_data:[ 328 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 8 samples:Blank_3 Blank_4 QC_1 ... PS4P3 PS4P4
#> 3.variable_info:[ 328 x 3 data.frame]
#> 328 variables:M307T548_POS M299T359_POS M344T471_POS ... M361T681_POS M236T543_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2024-09-06 08:49:49
#> filter_variables ----------
#> Package Function.used Time
#> 1 massdataset filter_variables() 2024-09-06 08:49:49