-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlimma_best_model.R
executable file
·109 lines (91 loc) · 4.86 KB
/
limma_best_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
############################################################################################
# Copyright (c) 2022 - Mucosal Immunology Lab, Monash University, Melbourne, Australia #
# Author: Matthew Macowan #
# This script is provided under the MIT licence (see LICENSE.txt for details) #
############################################################################################
# Define a function to select the best model formula from a selection of input parameters
limma_best_model <- function(input_data, key_variable = NULL, other_parameters = NULL, selection_metric = 'bic') {
# Load required packages
library(pacman)
pkgs <- c('BiocGenerics', 'base', 'biomformat', 'IRanges', 'S4Vectors', 'dplyr', 'plotly',
'ggplot2', 'ggtree', 'ggpubr', 'rstatix', 'Matrix', 'limma', 'phyloseq', 'stats',
'stringr', 'utils', 'purrr')
pacman::p_load(char = pkgs)
# Perform sanity checks
if (is.null(key_variable)) {stop('Please provide the key variable you are interested in testing downstream.')}
if (is.null(other_parameters)) {stop('Please provide additional variables for testing for the best model fit.')}
# Define a function to ensure that character 'NA' values are switched with real 'NA' values
ensure_NA <- function(vector) {
replace(vector, vector == 'NA', NA)
}
# Remove samples with NA values for the key_variable
if (class(input_data) == 'phyloseq') {
input_data <- prune_samples(!is.na(ensure_NA(sample_data(input_data)[[key_variable]])), input_data)
# Retrieve sample data and OTU table data
model_matrix_data <- data.frame(sample_data(input_data))
count_data <- data.frame(otu_table(input_data))
}
if (class(input_data) == 'SummarizedExperiment') {
input_data <- input_data[, !is.na(ensure_NA(input_data@metadata$metadata[[key_variable]]))]
input_metadata <- input_data@metadata$metadata[!is.na(ensure_NA(input_data@metadata$metadata[[key_variable]])),]
if (class(input_metadata[[key_variable]]) == 'factor') {
input_metadata[[key_variable]] <- droplevels(input_metadata[[key_variable]])
}
# Retrieve sample data and counts data
model_matrix_data <- data.frame(input_metadata)
count_data <- data.frame(assay(input_data))
}
# Prepare the combinations of test parameters provided
combinations <- flatten(lapply(seq_along(c(key_variable, other_parameters)), function(x) combn(c(key_variable, other_parameters), x, FUN = list)))
for (i in seq_along(combinations)) {
combinations[[i]] <- paste(combinations[[i]], collapse = ' + ')
}
combinations <- unlist(combinations)
# Prepare model matrixes for each combination
model_list <- list()
for (i in seq_along(combinations)) {
formula <- as.formula(paste0('~', combinations[i]))
model <- model.matrix(formula, data = model_matrix_data)
model_list[[i]] <- model
}
# Check for the best model
model_scores <- data.frame(table(limma::selectModel(count_data, model_list, criterion = selection_metric)$pref),
model = combinations) %>%
mutate(best = ifelse(Freq == max(Freq), 'Best', 'Other')) %>%
arrange(desc(Freq)) %>%
mutate(formula_length = unlist(lapply((str_split(as.character(model), ' \\+ ')), length))) %>%
mutate(model = factor(model, levels = model),
freq_best_perc = Freq / sum(Freq) * 100)
colnames(model_scores) <- c('model_number', 'freq_as_best', 'Model', 'Best', 'formula_length', 'freq_best_perc')
model_scores2 <- model_scores %>%
arrange(freq_best_perc) %>%
mutate(Model = factor(Model, levels = Model))
if (nrow(model_scores2) > 10) {
model_scores2 <- slice_tail(model_scores2, n = 10)
}
plot_subtitle <- case_when(
selection_metric == 'aic' ~ 'Akaike\'s Information Criterion (AIC)',
selection_metric == 'bic' ~ 'Bayesian Information Criterion (BIC)'
)
# Plot the AIC scores
scores_plot <- ggplot(model_scores2, aes(x = freq_best_perc, y = Model)) +
geom_col(aes(fill = Best)) +
scale_fill_manual(values = c('Best' = '#1261A0', 'Other' = 'grey60')) +
guides(fill = 'none') +
labs(title = 'Best Fitting Model',
subtitle = plot_subtitle,
x = 'Model is Best for this\nPercentage of Features')
print(scores_plot)
# Retrieve the best model
best_model <- paste0('~ ', model_scores$Model[1])
if (!str_detect(best_model, key_variable)) {
best_model <- paste0(best_model, ' + ', key_variable)
}
best_model_matrix <- model.matrix(as.formula(best_model), data = model_matrix_data)
# Return data or graph
return(list(formula_string = best_model,
key_var_coef = which(str_detect(colnames(best_model_matrix), key_variable)),
scores_plot = scores_plot,
model_scores = model_scores,
selection_metric = str_to_upper(selection_metric)))
}