Filter samples/variables based on the conditions
filter_samples(object, flist, prune = TRUE, apply_to = "all")
filter_variables(
object,
flist,
prune = TRUE,
apply_to = "all",
according_to_samples = "all"
)
(required) mass_dataset class object.
(required) A function or list of functions that take a vector of abundance values and return a logical.
(optional) A logical. Default TRUE. If TRUE, then
the function returns the pruned mass_dataset-class object, rather
than the logical vector of samples/variables that passed the filter.
(optional) Which samples/variables the function is applied to. Default is "all". If you only want to apply it to specific samples/variables, set it to a vector of their names. All other samples/variables will be set as TRUE.
(optional) Which samples are used to filter variables. Default is "all". If you want to use only some of the samples, provide their names as a vector.
A logical vector whose length equals the number of samples/variables in the mass_dataset-class object.
Alternatively, if prune == TRUE, the pruned mass_dataset-class
object is returned instead.
data("expression_data")
data("sample_info")
data("variable_info")
object =
create_mass_dataset(
expression_data = expression_data,
sample_info = sample_info,
variable_info = variable_info,
)
filter_samples(object, function(x) {
sum(is.na(x)) / length(x) < 0.4
})
#> --------------------
#> massdataset version: 1.0.12
#> --------------------
#> 1.expression_data:[ 1000 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:03
#> filter_samples ----------
#> Package Function.used Time
#> 1 massdataset filter_samples() 2022-08-07 19:33:03
filter_samples(object, function(x) {
sum(is.na(x)) / length(x) < 0.4
}, prune = FALSE)
#> Blank_3 Blank_4 QC_1 QC_2 PS4P1 PS4P2 PS4P3 PS4P4
#> FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## Only apply the filter to Subject samples
object2 =
filter_samples(
object = object,
flist = function(x) {
sum(is.na(x))/length(x) < 0.2
},
prune = TRUE,
apply_to = get_sample_id(object)[extract_sample_info(object)$class == "Subject"]
)
object2
#> --------------------
#> massdataset version: 1.0.12
#> --------------------
#> 1.expression_data:[ 1000 x 4 data.frame]
#> 2.sample_info:[ 4 x 4 data.frame]
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:03
#> filter_samples ----------
#> Package Function.used Time
#> 1 massdataset filter_samples() 2022-08-07 19:33:03
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#> ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
#> ✔ tibble 3.1.7 ✔ dplyr 1.0.9
#> ✔ tidyr 1.2.0 ✔ stringr 1.4.0
#> ✔ readr 2.1.2 ✔ forcats 0.5.1.9000
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ tibble::add_column() masks massdataset::add_column()
#> ✖ dplyr::filter() masks massdataset::filter(), stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
data("expression_data")
data("sample_info")
data("variable_info")
object =
create_mass_dataset(
expression_data = expression_data,
sample_info = sample_info,
variable_info = variable_info,
)
object
#> --------------------
#> massdataset version: 1.0.12
#> --------------------
#> 1.expression_data:[ 1000 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> 1 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:04
#### Remove variables which have 50% or more MVs (missing values) across all samples.
library(tidyverse)
filter_variables(object, function(x) {
sum(is.na(x)) / length(x) < 0.5
}, prune = FALSE) %>%
head()
#> M136T55_2_POS M79T35_POS M307T548_POS M183T224_POS M349T47_POS
#> TRUE TRUE TRUE FALSE TRUE
#> M182T828_POS
#> TRUE
filter_variables(object, function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = TRUE)
#> --------------------
#> massdataset version: 1.0.12
#> --------------------
#> 1.expression_data:[ 422 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 422 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:04
#> filter_variables ----------
#> Package Function.used Time
#> 1 massdataset filter_variables() 2022-08-07 19:33:04
#### Remove variables which have 50% or more MVs (missing values) in the QC samples only.
filter_variables(
object,
flist = function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = TRUE,
according_to_samples =
get_sample_id(object)[extract_sample_info(object)$class == "QC"]
)
#> --------------------
#> massdataset version: 1.0.12
#> --------------------
#> 1.expression_data:[ 496 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 496 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:04
#> filter_variables ----------
#> Package Function.used Time
#> 1 massdataset filter_variables() 2022-08-07 19:33:04
#### Keep variables which have less than 50% MVs (missing values) in either the QC or the Subject samples.
idx1 =
filter_variables(
object,
flist = function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = FALSE,
according_to_samples =
get_sample_id(object)[extract_sample_info(object)$class == "QC"]
)
idx2 =
filter_variables(
object,
flist = function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = FALSE,
according_to_samples =
get_sample_id(object)[extract_sample_info(object)$class == "Subject"]
)
idx =
which(idx1 | idx2)
object2 = object[idx,]
object2
#> --------------------
#> massdataset version: 1.0.12
#> --------------------
#> 1.expression_data:[ 642 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 642 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:04
#> subset ----------
#> Package Function.used Time
#> 1 massdataset [ 2022-08-07 19:33:04
#### Keep variables with RSD (in QC samples) < 30%
object3 =
filter_variables(
object = object,
flist = function(x) {
rsd = sd(x) * 100 / mean(x)
rsd = ifelse(is.na(rsd), 100, rsd)
rsd < 30
},
apply_to = "all",
prune = TRUE,
according_to_samples = get_sample_id(object)[extract_sample_info(object)$class == "QC"]
)
object3
#> --------------------
#> massdataset version: 1.0.12
#> --------------------
#> 1.expression_data:[ 328 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 328 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:04
#> filter_variables ----------
#> Package Function.used Time
#> 1 massdataset filter_variables() 2022-08-07 19:33:04