diff --git a/docs/images/dag_diablo_design_matrix.png b/docs/images/dag_diablo_design_matrix.png new file mode 100644 index 0000000..de78330 Binary files /dev/null and b/docs/images/dag_diablo_design_matrix.png differ diff --git a/docs/images/dag_transformation.png b/docs/images/dag_transformation.png new file mode 100644 index 0000000..43d697e Binary files /dev/null and b/docs/images/dag_transformation.png differ diff --git a/docs/overview.html b/docs/overview.html index 7d73ffc..132504a 100644 --- a/docs/overview.html +++ b/docs/overview.html @@ -274,6 +274,11 @@
colours_list <- list(
+Code
head(interesting_features) ## vector of feature IDs
+#> [1] "ARS-BFGL-NGS-27468" "BovineHD2300010006" "BovineHD0300000351"
+#> [4] "BovineHD1900011146" "BovineHD0900026231" "ENSBTAG00000022715"
+
+colours_list <- list(
"status" = c("Control" = "gold", "BRD" = "lightblue"),
"day_on_feed" = colorRamp2(c(5, 70), c("white", "pink3")),
"de_status" = c("downregulated" = "deepskyblue",
@@ -350,14 +365,14 @@ mo_set, # the MultiDataSet object
interesting_features, # vector of feature IDs of interest
center = TRUE, # centering and scaling data for
- scale = TRUE, # easier visualisation
- show_column_names = FALSE, # customising the heatmap
+ scale = TRUE, # easier visualisation
+ show_column_names = FALSE, # hide sample IDs
only_common_samples = TRUE, # only samples present in all omics
samples_info = c("status", "day_on_feed"), # add info about samples
features_info = c("de_status"), # add info about features
colours_list = colours_list, # customise colours
label_cols = list( # specify features label
- "rnaseq" = "Name", # from features metadata
+ "rnaseq" = "Name", # from features metadata
"metabolome" = "name"
),
truncate = 20
@@ -369,7 +384,318 @@ plot_samples_upset()
), or generating a density plot for each omics dataset (plot_density_data()
).
1.2 Data pre-processing
-Target factories have been implemented to facilitate the application of similar tasks across the different omics datasets. For example, transformation_datasets_factory()
+Target factories have been implemented to facilitate the application of similar tasks across the different omics datasets. For example, the transformation_datasets_factory()
function generates a sequence of targets to apply one of many possible transformations (from the vsn
, DESeq2
, or bestNormalize
packages, for example) on each omics dataset, store information about each transformation performed, and generate a new MultiDataSet object in which the omics measurements have been transformed:
+
+Code
+
+
+transformation_datasets_factory(
+ mo_set_de, # MultiDataSet object
+ c("rnaseq" = "vst-deseq2", # VST through DESeq2 for RNAseq
+ "metabolome" = "logx"), # log2-transf. for NMR dataset
+ log_bases = 2, # Base for log transformation
+ pre_log_functions = zero_to_half_min, # Handling 0s in log2-transf.
+ transformed_data_name = "mo_set_transformed" # New MultiDataSet object
+)
+
+
+
+Note that there is also the option for users to apply their own custom transformations to the datasets.
+Similarly, the pca_complete_data_factory
generates a list of targets to run a PCA on each omics dataset via the pcaMethods
package, and if necessary imputes missing values through NIPALS-PCA. The PCA results can be easily visualised for all or specific omics datasets:
+
+Code
plot_screeplot_pca(pca_runs_list)
+
+
+
+
+
+Code
plot_samples_coordinates_pca(
+ pca_runs_list, # List of PCA results
+ datasets = "snps", # Dataset to plot
+ pcs = 1:3, # Principal components to display
+ mo_data = mo_set, # MultiDataSet object
+ colour_upper = "geno_comp_cluster", # Samples covariate
+ shape_upper = "status", # Samples covariate
+ colour_lower = "feedlot", # Samples covariate
+ scale_colour_lower = scale_colour_brewer(palette = "Set1") # Custom palette
+) +
+ theme(legend.box = "vertical") # Plot legend vertically
+
+
+
+
+
+1.3 Data pre-filtering
+The created MultiDataSet
object can be filtered in a number of ways, both in terms of samples and features: via a list of sample or feature IDs, or using logical tests on samples or features metadata. In addition, we implement targets factories to retain only the most variable features in each omics dataset (unsupervised filtering), or to retain the features most associated with an outcome of interest, via sPLS-DA from mixOmics
(supervised filtering). This pre-filtering step is essential to reduce the size of the datasets prior to multi-omics integration.
+
+Code
+
+
+feature_preselection_splsda_factory(
+ mo_set_complete, # A MultiDataSet object
+ group = "status", # Sample covariate to use for supervised filtering
+ to_keep_ns = c( # Number of features to retain per dataset
+ "snps" = 1000,
+ "rnaseq" = 1000
+ ),
+ filtered_set_target_name = "mo_presel_supervised" # Name of filtered object
+)
+
+
+
+#> Object of class 'MultiDataSet'
+#> . assayData: 3 elements
+#> . snps: 1000 features, 139 samples
+#> . rnaseq: 994 features, 143 samples
+#> . metabolome: 55 features, 139 samples
+#> . featureData:
+#> . snps: 1000 rows, 13 cols (feature_id, ..., p_value)
+#> . rnaseq: 994 rows, 15 cols (feature_id, ..., de_signif)
+#> . metabolome: 55 rows, 16 cols (feature_id, ..., de_signif)
+#> . rowRanges:
+#> . snps: YES
+#> . rnaseq: YES
+#> . metabolome: NO
+#> . phenoData:
+#> . snps: 139 samples, 10 cols (id, ..., geno_comp_3)
+#> . rnaseq: 143 samples, 10 cols (id, ..., geno_comp_3)
+#> . metabolome: 139 samples, 10 cols (id, ..., geno_comp_3)
+
+
+1.4 Multi-omics data integration
+At the moment, moiraine
provides functions and target factories to facilitate the use of five integration methods: sPLS and DIABLO from the mixOmics
package, sO2PLS from OmicsPLS
, as well as MOFA
and MEFISTO
from MOFA2
.
+This involves providing functions to transform a MultiDataSet
object into the required input format for each integration method; for example for sPLS:
+
+Code
spls_input <- get_input_spls(
+ mo_presel_supervised,
+ mode = "canonical",
+ datasets = c("rnaseq", "metabolome")
+)
+
+map(spls_input, \(x) x[1:5, 1:5])
+
+
+
+#> $rnaseq
+#> ENSBTAG00000000020 ENSBTAG00000000046 ENSBTAG00000000056
+#> R9497 3.486141 7.211634 10.25302
+#> R5969 3.486141 7.436865 10.45008
+#> R5327 4.005139 6.731310 10.72323
+#> R5979 3.486141 7.538956 10.46189
+#> R9504 4.252314 7.493829 10.20723
+#> ENSBTAG00000000061 ENSBTAG00000000113
+#> R9497 5.028442 13.49233
+#> R5969 4.285078 12.95106
+#> R5327 5.313842 13.14427
+#> R5979 4.333030 12.55548
+#> R9504 4.784574 12.71693
+#>
+#> $metabolome
+#> HMDB00001 HMDB00008 HMDB00042 HMDB00043 HMDB00060
+#> R9497 3.397217 3.405992 9.654099 7.559186 0.5849625
+#> R5969 3.318827 2.137504 7.639522 6.539159 2.5849625
+#> R5327 3.721137 6.270529 6.963474 6.024586 2.6322682
+#> R5979 3.326626 2.232661 8.401306 7.394891 2.5849625
+#> R9504 3.603548 5.675251 7.879583 7.669594 0.8479969
+
+moiraine
also offers helper functions and target factories to facilitate the application of these integration tools. For example, the diablo_predefined_design_matrix()
function generates one of the three recommended design matrices for DIABLO (null, full or weighted full) for a given DIABLO input object, while the diablo_pairwise_pls_factory()
factory creates a list of targets to estimate the optimal design matrix to use for DIABLO based on datasets pairwise correlations estimated using PLS:
+
+Code
+
+
+list(
+ tar_target(
+ diablo_input,
+ get_input_mixomics_supervised(
+ mo_presel_supervised, # MultiDataSet object (prefiltered)
+ group = "status" # Samples covariate of interest
+ )
+ ),
+ diablo_pairwise_pls_factory(diablo_input) # Target factory for design matrix
+ # estimation
+)
+
+
+
+In addition, several plotting functions e.g. diablo_plot_tune()
to show the results of model tuning in DIABLO or so2pls_plot_summary()
, shown below, to visualise the percentage of variance explained by each latent component constructed by sO2PLS:
+
+Code
so2pls_plot_summary(so2pls_final_run)
+
+
+
+
+
+
+1.5 Intepreting the integration results
+One of the main goals of moiraine
is to facilitate the interpretation of the omics integration results. To this end, the outcome of any of the supported integration methods can be converted to a standardised integration output format, e.g.:
+
+Code
get_output(mofa_trained)
+
+
+
+#> $features_weight
+#> # A tibble: 10,245 × 5
+#> feature_id dataset latent_dimension weight importance
+#> <chr> <fct> <fct> <dbl> <dbl>
+#> 1 21-25977541-C-T-rs41974686 snps Factor 1 0.000374 0.139
+#> 2 22-51403583-A-C-rs210306176 snps Factor 1 0.0000535 0.0199
+#> 3 24-12959068-G-T-rs381471286 snps Factor 1 0.000268 0.0999
+#> 4 8-85224224-T-C-rs43565287 snps Factor 1 -0.000492 0.183
+#> 5 ARS-BFGL-BAC-16973 snps Factor 1 -0.000883 0.329
+#> 6 ARS-BFGL-BAC-19403 snps Factor 1 -0.000500 0.186
+#> 7 ARS-BFGL-BAC-2450 snps Factor 1 0.000555 0.207
+#> 8 ARS-BFGL-BAC-2600 snps Factor 1 -0.0000325 0.0121
+#> 9 ARS-BFGL-BAC-27911 snps Factor 1 -0.000568 0.212
+#> 10 ARS-BFGL-BAC-35925 snps Factor 1 0.000803 0.299
+#> # ℹ 10,235 more rows
+#>
+#> $samples_score
+#> # A tibble: 720 × 3
+#> sample_id latent_dimension score
+#> <chr> <fct> <dbl>
+#> 1 G1979 Factor 1 1.95
+#> 2 G2500 Factor 1 1.98
+#> 3 G3030 Factor 1 3.39
+#> 4 G3068 Factor 1 3.65
+#> 5 G3121 Factor 1 1.57
+#> 6 G3315 Factor 1 -1.81
+#> 7 G3473 Factor 1 -2.63
+#> 8 G3474 Factor 1 -2.30
+#> 9 G3550 Factor 1 2.98
+#> 10 G3594 Factor 1 -1.88
+#> # ℹ 710 more rows
+#>
+#> $variance_explained
+#> # A tibble: 15 × 3
+#> latent_dimension dataset prop_var_expl
+#> <fct> <fct> <dbl>
+#> 1 Factor 1 snps 0.000313
+#> 2 Factor 1 rnaseq 0.496
+#> 3 Factor 1 metabolome 0.137
+#> 4 Factor 2 snps 0.239
+#> 5 Factor 2 rnaseq 0.000197
+#> 6 Factor 2 metabolome 0.0108
+#> 7 Factor 3 snps 0.000108
+#> 8 Factor 3 rnaseq 0.199
+#> 9 Factor 3 metabolome 0.0000872
+#> 10 Factor 4 snps 0.0000927
+#> 11 Factor 4 rnaseq 0.0759
+#> 12 Factor 4 metabolome 0.0194
+#> 13 Factor 5 snps 0.000205
+#> 14 Factor 5 rnaseq 0.0198
+#> 15 Factor 5 metabolome 0.0296
+#>
+#> attr(,"class")
+#> [1] "output_dimension_reduction"
+#> attr(,"method")
+#> [1] "MOFA"
+
+This object can then be used to visualise the integration results in a number of ways, including:
+
+- percentage of variance explained:
+
+
+Code
plot_variance_explained(mofa_output)
+
+
+
+
+
+- Sample scores as pairwise scatterplots:
+
+
+Code
plot_samples_score(
+ mofa_output, # MOFA standardised output
+ latent_dimensions = paste("Factor", 1:3), # MOFA factors to display
+ mo_data = mo_set, # MultiDataSet object
+ colour_upper = "status", # Sample covariate
+ scale_colour_upper = scale_colour_brewer(palette = "Set1"), # Custom palette
+ shape_upper = "gender", # Sample covariate
+ colour_lower = "geno_comp_cluster" # Sample covariate
+) +
+ theme(legend.box = "vertical")
+
+
+
+
+
+- Sample scores against samples covariate of interest (either categorical or continuous):
+
+
+Code
plot_samples_score_covariate(
+ mofa_output, # MOFA standardised output
+ mo_set, # MultiDataSet object
+ "status", # Sample covariate of interest
+ colour_by = "status", # Other sample covariate
+ shape_by = "geno_comp_cluster", # Other sample covariate
+ latent_dimensions = paste("Factor", 1:2) # MOFA factors to display
+)
+#> Warning: Removed 4 rows containing missing values (`geom_point()`).
+
+
+
+
+
+- Top contributing features:
+
+
+Code
plot_top_features(
+ mofa_output, # MOFA standardised output
+ mo_data = mo_set, # MultiDataSet object
+ label_cols = list( # Custom labels for features from
+ "rnaseq" = "Name", # features metadata
+ "metabolome" = "name"
+ ),
+ latent_dimensions = paste("Factor", 1:2) # MOFA factors to display
+)
+
+
+
+
+
+1.6 Evaluating the integration results
+
+1.7 Comparison different integration results