Filter samples/variables based on the conditions

filter_samples(object, flist, prune = TRUE, apply_to = "all")

filter_variables(
  object,
  flist,
  prune = TRUE,
  apply_to = "all",
  according_to_samples = "all"
)

Arguments

object

(required) mass_dataset class object.

flist

(required) A function or list of functions that take a vector of abundance values and return a logical.

prune

(optional) A logical. Default TRUE. If TRUE, the function returns the pruned mass_dataset-class object; if FALSE, it returns the logical vector indicating which samples/variables passed the filter.

apply_to

(optional) Which samples (for filter_samples) or variables (for filter_variables) the filter is applied to. Default is "all". To apply the filter only to specific samples/variables, provide their names as a vector. All other samples/variables will be set as TRUE (i.e. kept).

according_to_samples

(optional) Which samples are used to filter variables. Default is "all". If you want to use only some of the samples, provide their names as a vector.

Value

A logical vector whose length equals the number of samples/variables in the mass_dataset-class object. Alternatively, if prune==TRUE, the pruned mass_dataset-class object is returned instead.

Author

Xiaotao Shen shenxt1990@outlook.com

Examples

data("expression_data")
data("sample_info")
data("variable_info")

object =
  create_mass_dataset(
    expression_data = expression_data,
    sample_info = sample_info,
    variable_info = variable_info,
  )
 
filter_samples(object, function(x) {
  sum(is.na(x)) / length(x) < 0.4
})
#> -------------------- 
#> massdataset version: 1.0.12 
#> -------------------- 
#> 1.expression_data:[ 1000 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:03
#> filter_samples ---------- 
#>       Package    Function.used                Time
#> 1 massdataset filter_samples() 2022-08-07 19:33:03

filter_samples(object, function(x) {
  sum(is.na(x)) / length(x) < 0.4
}, prune = FALSE)
#> Blank_3 Blank_4    QC_1    QC_2   PS4P1   PS4P2   PS4P3   PS4P4 
#>   FALSE   FALSE    TRUE    TRUE   FALSE   FALSE   FALSE   FALSE 

## only apply the filter to Subject samples
object2 =
filter_samples(
 object = object,
 flist = function(x) {
   sum(is.na(x))/length(x) < 0.2
 },
 prune = TRUE,
 apply_to = get_sample_id(object)[extract_sample_info(object)$class == "Subject"]
)

object2
#> -------------------- 
#> massdataset version: 1.0.12 
#> -------------------- 
#> 1.expression_data:[ 1000 x 4 data.frame]
#> 2.sample_info:[ 4 x 4 data.frame]
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:03
#> filter_samples ---------- 
#>       Package    Function.used                Time
#> 1 massdataset filter_samples() 2022-08-07 19:33:03
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#>  ggplot2 3.3.6           purrr   0.3.4     
#>  tibble  3.1.7           dplyr   1.0.9     
#>  tidyr   1.2.0           stringr 1.4.0     
#>  readr   2.1.2           forcats 0.5.1.9000
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#>  tibble::add_column() masks massdataset::add_column()
#>  dplyr::filter()      masks massdataset::filter(), stats::filter()
#>  dplyr::lag()         masks stats::lag()
data("expression_data")
data("sample_info")
data("variable_info")

object =
  create_mass_dataset(
    expression_data = expression_data,
    sample_info = sample_info,
    variable_info = variable_info,
  )
object
#> -------------------- 
#> massdataset version: 1.0.12 
#> -------------------- 
#> 1.expression_data:[ 1000 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> 1 processings in total
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:04

#### Remove variables which have 50% or more missing values (MVs) across all samples.
library(tidyverse)
filter_variables(object, function(x) {
  sum(is.na(x)) / length(x) < 0.5
}, prune = FALSE) %>%
  head()
#> M136T55_2_POS    M79T35_POS  M307T548_POS  M183T224_POS   M349T47_POS 
#>          TRUE          TRUE          TRUE         FALSE          TRUE 
#>  M182T828_POS 
#>          TRUE 

filter_variables(object, function(x) {
  sum(is.na(x)) / length(x) < 0.5
},
prune = TRUE)
#> -------------------- 
#> massdataset version: 1.0.12 
#> -------------------- 
#> 1.expression_data:[ 422 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 422 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:04
#> filter_variables ---------- 
#>       Package      Function.used                Time
#> 1 massdataset filter_variables() 2022-08-07 19:33:04

#### Remove variables which have 50% or more MVs in the QC samples only.
filter_variables(
  object,
  flist = function(x) {
    sum(is.na(x)) / length(x) < 0.5
  },
  prune = TRUE,
  according_to_samples =
    get_sample_id(object)[extract_sample_info(object)$class == "QC"]
)
#> -------------------- 
#> massdataset version: 1.0.12 
#> -------------------- 
#> 1.expression_data:[ 496 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 496 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:04
#> filter_variables ---------- 
#>       Package      Function.used                Time
#> 1 massdataset filter_variables() 2022-08-07 19:33:04

#### Keep variables which have less than 50% MVs in QC samples or in Subject samples
#### (i.e. remove only variables with 50% or more MVs in both groups).
idx1 =
  filter_variables(
    object,
    flist = function(x) {
      sum(is.na(x)) / length(x) < 0.5
    },
    prune = FALSE,
    according_to_samples =
      get_sample_id(object)[extract_sample_info(object)$class == "QC"]
  )

idx2 =
  filter_variables(
    object,
    flist = function(x) {
      sum(is.na(x)) / length(x) < 0.5
    },
    prune = FALSE,
    according_to_samples =
      get_sample_id(object)[extract_sample_info(object)$class == "Subject"]
  )

idx =
  which(idx1 | idx2)

object2 = object[idx,]

object2
#> -------------------- 
#> massdataset version: 1.0.12 
#> -------------------- 
#> 1.expression_data:[ 642 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 642 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:04
#> subset ---------- 
#>       Package Function.used                Time
#> 1 massdataset             [ 2022-08-07 19:33:04

####filter variables with RSD (in QC samples) < 30
object3 =
filter_variables(
  object = object,
  flist = function(x) {
    rsd = sd(x) * 100 / mean(x)
    rsd = ifelse(is.na(rsd), 100, rsd)
    rsd < 30
  },
  apply_to = "all",
  prune = TRUE,
  according_to_samples = get_sample_id(object)[extract_sample_info(object)$class == "QC"]
)

object3
#> -------------------- 
#> massdataset version: 1.0.12 
#> -------------------- 
#> 1.expression_data:[ 328 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 3.variable_info:[ 328 x 3 data.frame]
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> -------------------- 
#> Processing information (extract_process_info())
#> 2 processings in total
#> create_mass_dataset ---------- 
#>       Package         Function.used                Time
#> 1 massdataset create_mass_dataset() 2022-08-07 19:33:04
#> filter_variables ---------- 
#>       Package      Function.used                Time
#> 1 massdataset filter_variables() 2022-08-07 19:33:04