AminoAcidFrequencyDistribution report updates: fixed bug in displayed…

… IMGT numbering, added center-alignment option (new default).
uio-bmi · Oct 10, 2024 · c371f1e · c371f1e
1 parent c2eda2d
commit c371f1e
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 14 deletions.
diff --git a/immuneML/config/default_params/reports/amino_acid_frequency_distribution_params.yaml b/immuneML/config/default_params/reports/amino_acid_frequency_distribution_params.yaml
@@ -1,4 +1,4 @@
-imgt_positions: True
-relative_frequency: True
+alignment: CENTER
+relative_frequency: False
 split_by_label: False
 label: null
diff --git a/immuneML/reports/data_reports/AminoAcidFrequencyDistribution.py b/immuneML/reports/data_reports/AminoAcidFrequencyDistribution.py
@@ -28,12 +28,25 @@ class AminoAcidFrequencyDistribution(DataReport):
 
     **Specification arguments:**
 
-    - imgt_positions (bool): Whether to use IMGT positional numbering or sequence index numbering. When imgt_positions
-      is True, IMGT positions are used, meaning sequences of unequal length are aligned according to their IMGT
-      positions. By default, imgt_positions is True.
+    - alignment (str): Alignment style for aligning sequences of different lengths. Options are as follows:
+
+      - CENTER: center-align sequences of different lengths. The middle amino acid of any sequence will be labelled position 0. By default, alignment is CENTER.
+
+      - LEFT: left-align sequences of different lengths, starting at 1.
+
+      - RIGHT: right-align sequences of different lengths, ending at 0 (counting towards negative numbers).
+
+      - IMGT: align sequences based on their IMGT positional numbering, considering the sequence region_type (IMGT_CDR3 or IMGT_JUNCTION).
+        The main difference between CENTER and IMGT is that IMGT aligns the first and last amino acids, adding gaps in the middle,
+        whereas CENTER aligns the middle of the sequences, padding with gaps at the start and end of the sequence.
+        When region_type is IMGT_JUNCTION, the IMGT positions run from 104 (conserved C) to 118 (conserved W/F). When IMGT_CDR3 is used, these positions are 105 to 117.
+        For long CDR3 sequences, additional numbers are added in between IMGT positions 111 and 112.
+        See the official IMGT documentation for more details: https://www.imgt.org/IMGTScientificChart/Numbering/CDR3-IMGTgaps.html
 
     - relative_frequency (bool): Whether to plot relative frequencies (true) or absolute counts (false) of the
-      positional amino acids. By default, relative_frequency is True.
+      positional amino acids. Note that when sequences are of different length, setting relative_frequency to True will
+      produce different results depending on the alignment type, as some positions are only covered by the longest sequences.
+      By default, relative_frequency is False.
 
     - split_by_label (bool): Whether to split the plots by a label. If set to true, the Dataset must either contain a
       single label, or alternatively the label of interest can be specified under 'label'. If split_by_label is set to
@@ -60,7 +73,12 @@ class AminoAcidFrequencyDistribution(DataReport):
     @classmethod
     def build_object(cls, **kwargs):
         location = AminoAcidFrequencyDistribution.__name__
-        ParameterValidator.assert_type_and_value(kwargs["imgt_positions"], bool, location, "imgt_positions")
+
+        if "imgt_positions" in kwargs:
+            raise ValueError(f"{location}: parameter 'imgt_positions' is deprecated. For 'imgt_positions: True', use 'alignment: IMGT'. For 'imgt_positions: False', use any other alignment option (CENTER/LEFT/RIGHT).")
+
+        ParameterValidator.assert_type_and_value(kwargs["alignment"], str, location, "alignment")
+        ParameterValidator.assert_in_valid_list(kwargs["alignment"].upper(), ["IMGT", "LEFT", "RIGHT", "CENTER"], location, "alignment")
         ParameterValidator.assert_type_and_value(kwargs["relative_frequency"], bool, location, "relative_frequency")
         ParameterValidator.assert_type_and_value(kwargs["split_by_label"], bool, location, "split_by_label")
 
@@ -73,11 +91,11 @@ def build_object(cls, **kwargs):
 
         return AminoAcidFrequencyDistribution(**kwargs)
 
-    def __init__(self, dataset: Dataset = None, imgt_positions: bool = None, relative_frequency: bool = None,
+    def __init__(self, dataset: Dataset = None, alignment: bool = None, relative_frequency: bool = None,
                  split_by_label: bool = None, label: str = None,
                  result_path: Path = None, number_of_processes: int = 1, name: str = None):
         super().__init__(dataset=dataset, result_path=result_path, number_of_processes=number_of_processes, name=name)
-        self.imgt_positions = imgt_positions
+        self.alignment = alignment
         self.relative_frequency = relative_frequency
         self.split_by_label = split_by_label
         self.label_name = label
@@ -229,12 +247,17 @@ def _count_dict_to_df(self, raw_count_dict):
         return pd.DataFrame(df_dict)
 
     def _get_positions(self, sequence: ReceptorSequence):
-        if self.imgt_positions:
+        if self.alignment == "IMGT":
             positions = PositionHelper.gen_imgt_positions_from_length(
                 len(sequence.get_sequence(SequenceType.AMINO_ACID)),
                 sequence.get_attribute("region_type"))
-        else:
+        elif self.alignment == "LEFT":
             positions = list(range(1, len(sequence.get_sequence(SequenceType.AMINO_ACID)) + 1))
+        elif self.alignment == "RIGHT":
+            positions = list(range(-len(sequence.get_sequence(SequenceType.AMINO_ACID))+2, 1))
+        else:  # self.alignment == "CENTER"
+            positions = list(range(1, len(sequence.get_sequence(SequenceType.AMINO_ACID)) + 1))
+            positions = [pos - round(max(positions) / 2) for pos in positions]
 
         return [str(pos) for pos in positions]
 
@@ -256,7 +279,7 @@ def _plot_distribution(self, freq_dist):
                         facet_row="chain" if "chain" in freq_dist.columns else None,
                         color_discrete_map=PlotlyUtil.get_amino_acid_color_map(),
                         category_orders=category_orders,
-                        labels={"position": "IMGT position" if self.imgt_positions else "Position",
+                        labels={"position": "IMGT position" if self.alignment=="IMGT" else "Position",
                                 "count": "Count",
                                 "relative frequency": "Relative frequency",
                                 "amino acid": "Amino acid"}, template="plotly_white")
@@ -272,7 +295,13 @@ def _plot_distribution(self, freq_dist):
         return ReportOutput(path=file_path, name="Amino acid frequency distribution")
 
     def _get_position_order(self, positions):
-        return [str(int(pos)) if pos.is_integer() else str(pos) for pos in sorted(set(positions.astype(float)))]
+        if self.alignment == "IMGT":
+            if min(positions) == "105" and max(positions) == "117":
+                return PositionHelper.gen_imgt_positions_from_cdr3_length(len(set(positions)))
+            elif min(positions) == "104" and max(positions) == "118":
+                return ["104"] + PositionHelper.gen_imgt_positions_from_cdr3_length(len(set(positions))-2) + ["118"]
+        else:
+            return [str(pos) for pos in sorted(set(positions.astype(int)))]
 
     def _compute_frequency_change(self, freq_dist):
         classes = sorted(set(freq_dist["class"]))
@@ -307,7 +336,7 @@ def _plot_frequency_change(self, frequency_change):
                         facet_col="positive_class",
                         facet_row="chain" if "chain" in frequency_change.columns else None,
                         color_discrete_map=PlotlyUtil.get_amino_acid_color_map(),
-                        labels={"position": "IMGT position" if self.imgt_positions else "Position",
+                        labels={"position": "IMGT position" if self.alignment=="IMGT" else "Position",
                                 "positive_class": "Class",
                                 "frequency_change": "Difference in relative frequency",
                                 "amino acid": "Amino acid"}, template="plotly_white")

diff --git a/test/reports/data_reports/test_AminoAcidFrequencyDistribution.py b/test/reports/data_reports/test_AminoAcidFrequencyDistribution.py
@@ -22,6 +22,7 @@ def test_generate_sequence_dataset(self):
         params["dataset"] = dataset
         params["split_by_label"] = True
         params["result_path"] = path / "result"
+        params["alignment"] = "IMGT"
 
         report = AminoAcidFrequencyDistribution.build_object(**params)
         self.assertTrue(report.check_prerequisites())