-
Notifications
You must be signed in to change notification settings - Fork 0
/
immuneml_train_repert.xml
234 lines (187 loc) · 17.1 KB
/
immuneml_train_repert.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
<!-- Galaxy wrapper: simplified interface for training immune repertoire classifiers with immuneML. -->
<!-- NOTE(review): in the copy under review the version attribute had been mangled by e-mail
     obfuscation ("[email protected]"). It is restored here as the @VERSION@ token plus a ".0"
     wrapper suffix, on the assumption that prod_macros.xml defines @VERSION@; confirm against
     the repository before release. -->
<tool id="novice_immuneml_interface" name="Train immune repertoire classifiers (simplified interface)" version="@VERSION@.0">
    <description></description>
    <!-- Shared macros; the "requirements" macro expanded below comes from this import. -->
    <macros>
        <import>prod_macros.xml</import>
    </macros>
    <expand macro="requirements" />
<!--
  Job command (Cheetah template inside CDATA). In order, it
    1. copies the files under result/ of the selected dataset into the job working directory and
       moves anything inside a "repertoires" sub-folder up one level;
    2. calls build_yaml_from_arguments_wrapper.py with the form answers; the script is expected
       to leave specs.yaml in the extra-files directory of the "specs" output;
    3. runs immune-ml on a copy of that specification, writing into the extra-files directory of
       the HTML output;
    4. moves index.html, specs.yaml, immuneML_output.zip and the exported model zip onto the four
       Galaxy outputs.
  Galaxy-supplied paths are single-quoted so that whitespace or shell metacharacters in a path
  cannot split or alter the command line. Globs stay outside the quotes so the shell still expands them.
-->
<command><![CDATA[
    #if $iml_input
        cp -r '${iml_input.extra_files_path}/result/'* . &&
        ## Best effort: a dataset without a repertoires folder must not abort the job.
        ## Portable redirection is used instead of the bash-only ampersand form.
        (mv repertoires/* . >/dev/null 2>&1 || :) &&
        rm -rf repertoires &&
    #end if

    python '$__tool_directory__/build_yaml_from_arguments_wrapper.py'
        --output_path '${specs.files_path}'
    #if $labels
        --labels '$labels'
    #end if
    #if $ml_methods
        ## Galaxy joins a multi-select with commas. The commas become spaces and the value is
        ## deliberately left unquoted so that the shell passes one argument per method.
        #set methods_splitted = str($ml_methods).replace(",", " ")
        --ml_methods $methods_splitted
    #end if
    #if $training_percentage
        --training_percentage $training_percentage
    #end if
    #if $split_count
        --split_count $split_count
    #end if
    #if $sequence_cond.sequence_type
        --sequence_type $sequence_cond.sequence_type
    #end if
    ## The k-mer options only exist in the form when subsequence encoding was chosen.
    #if $sequence_cond.sequence_type == "subsequence"
        --position_type $sequence_cond.position_type
        --gap_type $sequence_cond.gap_cond.gap_type
        #if $sequence_cond.gap_cond.gap_type == "ungapped"
            --k $sequence_cond.gap_cond.k
        #end if
        #if $sequence_cond.gap_cond.gap_type == "gapped"
            --k_left $sequence_cond.gap_cond.k_left
            --k_right $sequence_cond.gap_cond.k_right
            --min_gap $sequence_cond.gap_cond.min_gap
            --max_gap $sequence_cond.gap_cond.max_gap
        #end if
    #end if
    #if $reads
        --reads $reads
    #end if

    ## immune-ml runs on a copy, so the generated specs.yaml can be moved to its output afterwards.
    && cp '${specs.files_path}/specs.yaml' yaml_copy
    && immune-ml ./yaml_copy '${html_outfile.files_path}' --tool GalaxyTrainMLModel
    && mv '${html_outfile.files_path}/index.html' '${html_outfile}'
    && mv '${specs.files_path}/specs.yaml' '${specs}'
    && mv '${html_outfile.files_path}/immuneML_output.zip' '${archive}'
    ## Assumes exactly one exported model zip exists; mv fails if the glob matches several files.
    && mv '${html_outfile.files_path}/exported_models/'*.zip '${optimal_model}'
]]>
</command>
<!--
  Form shown to the user. Every answer is forwarded by the command block as an argument of
  build_yaml_from_arguments_wrapper.py; the nested conditionals only reveal the k-mer questions
  when subsequence encoding is selected.
-->
<inputs>
    <!-- Dataset whose extra files (result/*) are copied into the job working directory. -->
    <param name="iml_input" type="data" format="immuneml_receptors"
           label="immuneML dataset (immune repertoires)"
           help="Here you can select an ImmuneML dataset containing a repertoire dataset, as produced by the ‘Create dataset’ tool. Please make sure your dataset contains enough repertoires, we recommend using at least 50. The minimum number of repertoires needed to run this tool successfully is 14 (for example: 7 diseased and 7 healthy). More repertoires are needed if your dataset is imbalanced (many more diseased or many more healthy), or if you decrease the percentage of data that is used for training. "/>

    <!-- Label to predict. An empty value would make the command omit the labels argument
         altogether, so an empty field is rejected in the form instead. -->
    <param type="text" name="labels" optional="false"
           label="Which property (“label”) of the repertoires would you like to predict?"
           help="Repertoire property to predict could for example be a disease status. This property must be present as a label in the repertoire metadata.">
        <validator type="empty_field" message="Please enter the name of the repertoire label to predict."/>
    </param>

    <!-- Choice between complete-sequence and subsequence (k-mer) based signals. -->
    <conditional name="sequence_cond">
        <param type="select" name="sequence_type"
               label="I assume that the true class of a repertoire (for example: disease status) can be determined based on the presence of..."
               display="radio"
               help="See 'Encoding' in the tool description.">
            <option value="subsequence">Similar (but not identical) CDR3 sequences, or identical subsequences</option>
            <option value="complete">Complete and identical receptor sequences</option>
        </param>
        <when value="subsequence">
            <!-- Checked = "invariant", unchecked = "positional". -->
            <param type="boolean" name="position_type"
                   label="If the same CDR3 subsequence occurs in a different position in two receptors, is this expected to be the same signal? "
                   truevalue="invariant" falsevalue="positional" checked="true"/>
            <conditional name="gap_cond">
                <param type="select" name="gap_type" label="The signal is expected to correspond to:" display="radio">
                    <option value="ungapped">Contiguous subsequences of amino acids</option>
                    <option value="gapped">Subsequences of amino acids separated by a gap</option>
                </param>
                <when value="ungapped">
                    <!-- A subsequence length of 0 is meaningless, so the minimum is 1. -->
                    <param type="integer" name="k"
                           label="Given a contiguous subsequence of amino acids containing a signal, the expected length of this subsequence is:"
                           value="3" min="1"/>
                </when>
                <when value="gapped">
                    <!-- NOTE(review): k_left / k_right still accept 0, and nothing checks that
                         max_gap is at least min_gap. Confirm what immuneML accepts before
                         tightening these bounds. -->
                    <param type="integer" name="k_left" label="Given a gapped signal, the sequence length before the gap is:" value="2" min="0"/>
                    <param type="integer" name="k_right" label="And the sequence length after the gap is:" value="2" min="0"/>
                    <param type="integer" name="min_gap" label="While the minimal gap length is:" value="0" min="0"/>
                    <param type="integer" name="max_gap" label="And the maximal gap length is:" value="5" min="0"/>
                </when>
            </conditional>
        </when>
    </conditional>

    <!-- Whether read counts are ignored ("unique") or used ("all"). -->
    <param type="select" name="reads" label="I assume that" display="radio"
           help="If only the presence/absence of a clonotype matters, the read frequency (‘count’) information is ignored. Otherwise, the importance of a sequence or subsequence is scaled by its read frequency, and large clonotypes will have more influence on the ML model and its results.">
        <option value="unique">Only the presence/absence of a clone matters</option>
        <option value="all">The frequency of a clone matters</option>
    </param>

    <!-- Multi-select; the command block turns the comma-separated value into separate arguments. -->
    <param type="select" name="ml_methods" label="Which ML methods would you like to include?"
           help="For each ML method, the optimal hyper parameter settings are determined and the performance of the methods is compared to each other."
           display="checkboxes" multiple="true">
        <option value="RandomForestClassifier">Random forest</option>
        <option value="SimpleLogisticRegression">Logistic regression</option>
        <option value="SVM">Support Vector Machine</option>
        <option value="KNN">K-nearest neighbors</option>
    </param>

    <param type="integer" name="training_percentage"
           label="Percentage of data that is used for training + validation (the remainder is used for testing):"
           value="70" min="50" max="90"
           help="This part of the data is used for training the classifier i.e., learning the relevant patterns in the data and determining the optimal hyper parameter settings for the classifier. The remaining data is used to test the performance of the classifier. There is no golden rule that determines the optimal percentage of training data, but typically a value between 60 and 80% is chosen."/>

    <!-- At least one split is required: with 0 repetitions nothing would be trained, and the
         command block expects an exported model to exist afterwards. -->
    <param type="integer" name="split_count"
           label="Number of times to repeat the training process with different random splits of data:"
           value="5" min="1"
           help="This is the number of times we split into random sets for training + validation and testing. The more often the experiment is repeated, the better the performance of the ML models can be estimated, but the longer it will take for the analysis to complete. "/>
</inputs>
<!-- History items created by one run; each one is filled by an mv at the end of the command block. -->
<outputs>
    <!-- specs.yaml as generated from the form answers. -->
    <data name="specs" format="txt" label="repertoire_classification.yaml"/>
    <!-- The zip found under exported_models/ in the immune-ml result folder. -->
    <data name="optimal_model" format="zip" label="optimal_ml_settings.zip"/>
    <!-- immuneML_output.zip from the immune-ml result folder. -->
    <data name="archive" format="zip" label="Archive: repertoire classification"/>
    <!-- index.html of the immune-ml result folder; its extra-files directory holds the rest of the run. -->
    <data name="html_outfile" format="html" label="Summary: repertoire classification"/>
</outputs>
<help><![CDATA[
The purpose of this tool is to train machine learning (ML) models to predict a characteristic per immune repertoire, such as
a disease status. One or more ML models are trained to classify repertoires based on the information within the sets of CDR3 sequences. Finally, the performance
of the different methods is compared.
Alternatively, if you want to predict a property per immune receptor, such as antigen specificity, check out the
`Train immune receptor classifiers (simplified interface) <root?tool_id=immuneml_train_classifiers>`_ tool instead.
The full documentation can be found `here <https://docs.immuneml.uio.no/latest/galaxy/galaxy_simple_repertoires.html>`_.

**Basic terminology**

In the context of ML, the characteristics to predict per repertoire are called **labels** and the values that these labels can take on are **classes**.
One could thus have a label named ‘CMV_status’ with possible classes ‘positive’ and ‘negative’. The labels and classes must be present in the metadata
file, in columns where the header and values correspond to the label and classes respectively.

.. image:: https://docs.immuneml.uio.no/latest/_images/metadata_repertoire_classification.png
   :height: 150

|

When training an ML model, the goal is for the model to learn **signals** within the data which discriminate between the different classes. An ML model
that predicts classes is also referred to as a **classifier**. A signal can have a variety of definitions, including the presence of specific receptors,
groups of similar receptors or short CDR3 subsequences in an immune repertoire. Our assumptions about what makes up a ‘signal’ determine how we
should represent our data to the ML model. This representation is called **encoding**. In this tool, the encoding is automatically chosen based on
the user's assumptions about the dataset.

.. image:: https://docs.immuneml.uio.no/latest/_images/repertoire_classification_overview.png
   :height: 500

|
|

**An overview of the components of the immuneML repertoire classification tool.**
immuneML reads in repertoire data with labels (+ and -), encodes the
data, trains user-specified ML models and summarizes the performance statistics per ML method.
Encoding: different forms of encoding are shown; full sequence encoding and position dependent and invariant subsequence encoding.
The disease-associated sequences or sub-sequences are highlighted with color. The different colors represent independent elements of the disease signal.
Each color represents one (sub)sequence, and position dependent subsequences can only have the same color when they occur in the same position,
although different colors (i.e., nucleotide or amino acid sequences) may occur in the same position.
Training: the training and validation data is used to train ML models and find the optimal hyperparameters through 5-fold cross-validation.
The test set is left out and is used to obtain a fair estimate of the model performance.

**Encoding**

The simplest encoding represents an immune repertoire based on the full CDR3 sequences that it contains. This means the ML models will learn to look
at which CDR3 sequences are more often present in the ‘positive’ or ‘negative’ classes. It also means that two similar (non-identical) CDR3 sequences
are treated as independent pieces of information; if a particular sequence often occurs in diseased repertoires, then finding a similar sequence in a
new repertoire is no evidence for this repertoire also being diseased.
Other encoding variants are based on shorter subsequences (e.g., 3 – 5 amino acids long, also referred to as k-mers) in the CDR3 regions of an immune repertoire. With this
encoding, the CDR3 regions are divided into overlapping subsequences and the (disease) signal may be characterized by the presence or absence of
certain sequence motifs in the CDR3 regions. Here, two similar CDR3 sequences are no longer independent, because they contain many identical subsequences.
A graphical representation of how a CDR3 sequence can be divided into k-mers, and how these k-mers can relate to specific positions in a 3D immune receptor
(here: antibody) is shown in this figure:

.. image:: https://docs.immuneml.uio.no/latest/_images/3mer_to_3d.png
   :height: 250

|

The subsequences may be position-dependent or invariant. Position invariant means that if a subsequence, e.g., ‘EDNA’ occurs in different positions
in the CDR3 it will still be considered the same signal. This is not the case for position dependent subsequences: if ‘EDNA’ often occurs at the
beginning of the CDR3 in diseased repertoires, then finding ‘EDNA’ at the end of a CDR3 in a new repertoire will be considered unrelated. Positions
are determined based on the IMGT numbering scheme.
Finally, it is possible to introduce gaps in the encoding of subsequences (not shown in the Figure). In this case, a motif is defined by two
subsequences separated by a region of varying nucleotide or amino acid length. Thus, the subsequences ‘EDNA’, ‘EDGNA’ and ‘EDGAGAGNA’ may all be
considered to be part of the same motif: ‘ED’ followed by ‘NA’ with a gap of 0 – 5 amino acids in between.
Note that in any case, the (sub)sequences that are associated with the ‘positive’ class may still be present in the ‘negative’ class, albeit at a lower rate.

**Training a machine learning model**

Training an ML model means optimizing the **parameters** for the model with the goal of predicting the correct class of an (unseen) immune repertoire.
Different ML methods require different procedures for training. In addition to the model parameters there are the **hyperparameters**, which
do not directly change the predictions of a model, but they control the learning process (for example: the learning speed).
The immune repertoires are divided into sets with different purposes: the training and validation sets are used for finding the optimal parameters
and hyperparameters respectively. The test set is held out, and is only used to estimate the performance of a trained model.
In this tool, a range of plausible hyperparameters have been predefined for each ML method. The optimal hyperparameters are found by splitting the
training/validation data into 5 equal portions, where 4 portions are used to train the ML model (with different hyperparameters) and the remaining
portion is used to validate the performance of these hyperparameter settings. This is repeated 5 times such that each portion has been used for
validation once. With the best hyperparameters found in the 5 repetitions, a final model is trained using all 5 portions of the data. This procedure
is also referred to as 5-fold cross-validation. Note that this 5-fold cross-validation is separate from the number of times the splitting into
training + validation and testing sets is done (see the overview figure).
Finally, the whole process is repeated one or more times with different randomly selected repertoires in the test set, to see how robust the performance
of the ML methods is. The number of times to repeat this splitting into training + validation and test sets is determined in the last question.

**Tool output**

This Galaxy tool will produce the following history elements:

- Summary: repertoire classification: an HTML page that allows you to browse through all results, including prediction accuracies on
  the various data splits and plots showing the performance of classifiers and learned parameters.

- Archive: repertoire classification: a .zip file containing the complete output folder as it was produced by immuneML. This folder
  contains the output of the TrainMLModel instruction including all trained models and their predictions, and report results.
  Furthermore, the folder contains the complete YAML specification file for the immuneML run, the HTML output and a log file.

- optimal_ml_settings.zip: a .zip file containing the raw files for the optimal trained ML settings (ML model, encoding). This .zip file can subsequently be used as an input when applying previously trained ML models to a new dataset. Currently, this can only be done locally using the command-line interface.

- repertoire_classification.yaml: the YAML specification file that was used by immuneML internally to run the analysis. This file can be
  downloaded, altered, and run again by immuneML using the `Train machine learning models <root?tool_id=immuneml_train_ml_model>`_ Galaxy tool.

**More analysis options**

A limited selection of immuneML options is available through this tool. If you wish to have full control of the analysis, consider using
the `Train machine learning models <root?tool_id=immuneml_train_ml_model>`_ Galaxy tool.
This tool provides other encodings and machine learning methods to choose from, as well as
data preprocessing and settings for hyperparameter optimization. The interface of the YAML-based tool expects more independence and knowledge about
machine learning from the user.
]]>
</help>
</tool>