Merge pull request #1206 from griffithlab/mutation_position

Fix Mutation Position column
griffithlab · Feb 19, 2025 · 7d2a69d · 7d2a69d
2 parents 7dc4386 + cdaf3a0
commit 7d2a69d
Show file tree

Hide file tree

Showing 64 changed files with 13,029 additions and 15,636 deletions.
diff --git a/docs/pvacseq/output_files.rst b/docs/pvacseq/output_files.rst
@@ -165,7 +165,7 @@ all_epitopes.tsv and filtered.tsv Report Columns
    * - ``Sub-peptide Position``
      - The one-based position of the epitope within the protein sequence used to make the prediction
    * - ``Mutation Position``
-     - The one-based positional range (inclusive) of the mutation within the epitope sequence. If the mutation is a deletion, the amino acids flanking the deletion are recorded. Note that in the case of ambiguous amino acid changes, this reflects the change that is left-aligned, starting from the first changed amino acid; this may differ from the ``Mutation`` column.
+     - A comma-separated list of all amino acid positions in the ``MT Epitope Seq`` that are different from the ``WT Epitope Seq``. ``NA`` if the ``WT Epitope Seq`` is ``NA``.
    * - ``MT Epitope Seq``
      - The mutant epitope sequence
    * - ``WT Epitope Seq``
@@ -339,8 +339,7 @@ included epitopes, selecting the best-scoring epitope, and which values are outp
    * - ``Allele``
      - The Allele that the Best Peptide is binding to
    * - ``Pos``
-     - The one-based position of the start of the mutation within the epitope sequence. ``0`` if the
-       start of the mutation is before the epitope (as can occur downstream of frameshift mutations)
+     - A comma-separated list of all amino acid positions in the ``MT Epitope Seq`` that are different from the ``WT Epitope Seq``. ``NA`` if the ``WT Epitope Seq`` is ``NA``.
    * - ``Prob Pos``
      - A list of positions in the Best Peptide that are problematic.
        ``None`` if the ``--problematic-pos`` parameter was not set during
@@ -507,7 +506,7 @@ Criteria Details
      - Best Peptide is likely in the founding clone of the tumor
      - ``DNA VAF > tumor_purity / 4``
    * - Anchor Criteria
-     - Fail if all mutated amino acids of the Best Peptide (``Pos``) are at an anchor position and the WT peptide has good binding ``(IC50 WT < binding_threshold)``
+     - Fail if if there are <= 2 mutated amino acids and all mutated amino acids of the Best Peptide (``Pos``) are at an anchor position and the WT peptide has good binding ``(IC50 WT < binding_threshold)``
      -
 
 .. _reference_matches:

diff --git a/docs/pvacsplice/output_files.rst b/docs/pvacsplice/output_files.rst
@@ -390,17 +390,6 @@ To tier the Best Peptide, several cutoffs can be adjusted using arguments provid
      - The threshold to use for filtering epitopes on the Ensembl transcript support level (TSL).
        Transcript support level needs to be <= this cutoff to be included in most tiers.
      - 1
-   * - ``--allele-specific-anchors``
-     - Use allele-specific anchor positions when tiering epitopes in the aggregate report. This option is available for 8, 9, 10, and
-       11mers and only for HLA-A, B, and C alleles. If this option is not enabled or as a fallback for unsupported lengths and alleles,
-       the default positions of [1, 2, epitope length - 1, and epitope length] are used. Please see https://doi.org/10.1101/2020.12.08.416271 for more details.
-     - False
-   * - ``--anchor-contribution-threshold``
-     - For determining allele-specific anchors, each position is assigned a score based on how binding is influenced by mutations. From these scores, the relative
-       contribution of each position to the overall binding is calculated. Starting with the highest relative contribution, positions whose score together account for the
-       selected contribution threshold are assigned as anchor locations. As a result, a higher threshold leads to the inclusion of more positions to be considered
-       anchors.
-     - 0.8
 
 Tiers
 *****
@@ -413,7 +402,7 @@ Given the thresholds provided above, the Best Peptide is evaluated and binned in
    * - Tier
      - Citeria
    * - ``Pass``
-     - Best Peptide passes the binding, expression, tsl, clonal, and anchor criteria
+     - Best Peptide passes the binding, expression, tsl, and clonal criteria
    * - ``Subclonal``
      - Best Peptide fails the clonal criteria but passes the binding, tsl, and
        anchor criteria

diff --git a/pvactools/lib/aggregate_all_epitopes.py b/pvactools/lib/aggregate_all_epitopes.py
@@ -416,26 +416,22 @@ def is_anchor_residue_pass(self, mutation):
         else:
             binding_threshold = self.binding_threshold
 
-        anchor_residue_pass = True
         anchors = get_anchor_positions(mutation['HLA Allele'], len(mutation['MT Epitope Seq']), self.allele_specific_anchors, self.anchor_probabilities, self.anchor_contribution_threshold, self.mouse_anchor_positions)
-        # parse out mutation position from str
+        # parse out mutation positions from str
         position = mutation["Mutation Position"]
         if pd.isna(position):
-            return anchor_residue_pass
-        elif '-' in position:
-            d_ind = position.index('-')
-            if all(pos in anchors for pos in range(int(position[0:d_ind]), int(position[d_ind+1:])+1)):
-                if pd.isna(mutation["{} WT IC50 Score".format(self.wt_top_score_metric)]):
-                    anchor_residue_pass = False
-                elif mutation["{} WT IC50 Score".format(self.wt_top_score_metric)] < binding_threshold:
-                    anchor_residue_pass = False
+            return True
         else:
-            if int(float(position)) in anchors:
+            positions = position.split(", ")
+            if len(positions) > 2:
+                return True
+            anchor_residue_pass = True
+            if all(int(pos) in anchors for pos in positions):
                 if pd.isna(mutation["{} WT IC50 Score".format(self.wt_top_score_metric)]):
                     anchor_residue_pass = False
                 elif mutation["{} WT IC50 Score".format(self.wt_top_score_metric)] < binding_threshold:
                     anchor_residue_pass = False
-        return anchor_residue_pass
+            return anchor_residue_pass
 
     #assign mutations to a "Classification" based on their favorability
     def get_tier(self, mutation, vaf_clonal):

diff --git a/pvactools/lib/calculate_reference_proteome_similarity.py b/pvactools/lib/calculate_reference_proteome_similarity.py
@@ -208,23 +208,29 @@ def get_wt_peptides(self):
         return records_dict
 
 
-    def extract_n_mer(self, full_peptide, subpeptide_position, mutation_position, mt_length, variant_type):
+    def extract_n_mer(self, mt_peptide, wt_peptide):
         #For non-frameshifts this ensures that we only test match_length epitopes that overlap the mutation
         #If we extract a larger region, we will get false-positive matches against the reference proteome
         #from the native wildtype portion of the peptide
         flanking_sequence_length = self.match_length - 1
-        if variant_type == 'inframe_del':
-            mt_start = subpeptide_position + (mutation_position)
+        for i in range(len(mt_peptide)):
+            if len(wt_peptide) < i:
+                first_mut_aa_pos = 0
+                break
+            if wt_peptide[i] != mt_peptide[i]:
+                first_mut_aa_pos = i
+                break
+        for i in range(len(mt_peptide)):
+            if len(wt_peptide) < i:
+                last_mut_aa_pos = len(mt_peptide)
+                break
+            if wt_peptide[i * -1] != mt_peptide[i * -1]:
+                last_mut_aa_pos = len(mt_peptide) - i
+                break
+        if last_mut_aa_pos >= first_mut_aa_pos:
+            return mt_peptide[first_mut_aa_pos-flanking_sequence_length:last_mut_aa_pos+flanking_sequence_length+1]
         else:
-            mt_start = subpeptide_position + (mutation_position-1)
-        start = mt_start - flanking_sequence_length
-        if start < 0:
-            start = 0
-        if variant_type == 'inframe_del':
-            end = mt_start + flanking_sequence_length
-        else:
-            end = mt_start + mt_length + flanking_sequence_length
-        return full_peptide[start:end]
+            return mt_peptide[last_mut_aa_pos-flanking_sequence_length:first_mut_aa_pos+flanking_sequence_length+1]
 
 
     def extract_n_mer_from_fs(self, full_peptide, wt_peptide, epitope, subpeptide_position):
@@ -273,6 +279,7 @@ def _get_peptide(self, line, mt_records_dict, wt_records_dict):
             else:
                 peptide = mt_records_dict[line['Mutation']]
             full_peptide = peptide
+            wt_peptide = None
         elif self.file_type == 'pVACsplice':
             if self._input_tsv_type(line) == 'aggregated':
                 identifier = line['ID']
@@ -282,18 +289,14 @@ def _get_peptide(self, line, mt_records_dict, wt_records_dict):
             mt_peptide = mt_records_dict[identifier]
             peptide = get_mutated_peptide_with_flanking_sequence(wt_peptide, mt_peptide, self.match_length-1)
             full_peptide = peptide
+            wt_peptide = None
         else:
             if self._input_tsv_type(line) == 'aggregated':
                 epitope = line['Best Peptide']
                 (rest_record_id, variant_type, aa_change) = line['Index'].rsplit(".", 2)
-                (_, mt_pos, wt_amino_acids, mt_amino_acids) = index_to_aggregate_report_aa_change(aa_change, variant_type)
-                mt_pos = int(line['Pos'].split('-')[0])
             else:
                 epitope = line['MT Epitope Seq']
                 variant_type = line['Variant Type']
-                if variant_type != 'FS':
-                    mt_pos = int(line['Mutation Position'].split('-')[0])
-                    (wt_amino_acids, mt_amino_acids) = line['Mutation'].split('/')
             full_peptide = mt_records_dict[line['Index']]
             wt_peptide = wt_records_dict[line['Index']]
 
@@ -302,18 +305,8 @@ def _get_peptide(self, line, mt_records_dict, wt_records_dict):
             if variant_type == 'FS':
                 peptide = self.extract_n_mer_from_fs(full_peptide, wt_peptide, epitope, subpeptide_position)
             else:
-                if mt_amino_acids == '-':
-                    mt_amino_acids = ''
-                if len(mt_amino_acids) == len(wt_amino_acids) and len(mt_amino_acids) > 1:
-                    #remove leading and trailing identical amino acids
-                    shortened_mt_amino_acids = ""
-                    for mt_aa, wt_aa in zip(mt_amino_acids, wt_amino_acids):
-                        if wt_aa != mt_aa:
-                            shortened_mt_amino_acids += mt_aa
-                else:
-                    shortened_mt_amino_acids = mt_amino_acids
-                peptide = self.extract_n_mer(full_peptide, subpeptide_position, mt_pos, len(shortened_mt_amino_acids), variant_type)
-        return peptide, full_peptide
+                peptide = self.extract_n_mer(full_peptide, wt_peptide)
+        return peptide, full_peptide, wt_peptide
 
 
     def _call_blast(self, full_peptide, p):
@@ -354,7 +347,7 @@ def _match_from_peptide_fasta(self, full_peptide):
         return results
 
 
-    def _generate_reference_match_dict_from_blast_records(self, blast_records, peptide):
+    def _generate_reference_match_dict_from_blast_records(self, blast_records, peptide, full_wt_peptide):
         reference_match_dict = []
         for blast_record in blast_records:
             if len(blast_record.alignments) > 0: # if there is at least one alignment
@@ -366,7 +359,9 @@ def _generate_reference_match_dict_from_blast_records(self, blast_records, pepti
                             for match in matches:
                                 # 'windows' of query peptides that match subject peptides
                                 windows = [match[i:i+self.match_length] for i in range(len(match)-(self.match_length-1))]
-                                for window in windows: 
+                                for window in windows:
+                                    if full_wt_peptide is not None and window in full_wt_peptide:
+                                        continue
                                     if window in peptide:
                                         reference_match_dict.append({
                                             'Hit ID': alignment.hit_id,
@@ -392,11 +387,13 @@ def _generate_reference_match_dict_from_peptide_fasta_results_for_pvacbind(self,
             })
         return self._combine_reference_match_entries(reference_match_dict)
 
-    def _generate_reference_match_dict_from_peptide_fasta_results(self, results, peptide, transcript):
+    def _generate_reference_match_dict_from_peptide_fasta_results(self, results, peptide, transcript, full_wt_peptide):
         reference_match_dict = []
         for transcript_seq, epitope, match_start in results:
             if transcript in transcript_seq.description:
                 continue
+            if full_wt_peptide is not None and epitope in full_wt_peptide:
+                continue
             reference_match_dict.append({
                 'Transcript': transcript,
                 'Hit ID': transcript_seq.id,
@@ -444,7 +441,7 @@ def _write_outputs(self, processed_peptides, mt_records_dict, wt_records_dict):
             metric_writer.writeheader()
 
             for line in reader:
-                peptide, full_peptide = self._get_peptide(line, mt_records_dict, wt_records_dict)
+                peptide, full_peptide, full_wt_peptide = self._get_peptide(line, mt_records_dict, wt_records_dict)
 
                 if self.peptide_fasta:
                     if peptide is None:
@@ -471,11 +468,11 @@ def _write_outputs(self, processed_peptides, mt_records_dict, wt_records_dict):
                             transcript = line['Transcript']
                         else:
                             transcript = line['Best Transcript']
-                        reference_matches = self._generate_reference_match_dict_from_peptide_fasta_results(results, peptide, transcript)
+                        reference_matches = self._generate_reference_match_dict_from_peptide_fasta_results(results, peptide, transcript, full_wt_peptide)
                     else:
                         reference_matches = self._generate_reference_match_dict_from_peptide_fasta_results_for_pvacbind(results, peptide)
                 else:
-                    reference_matches = self._generate_reference_match_dict_from_blast_records(results, peptide)
+                    reference_matches = self._generate_reference_match_dict_from_blast_records(results, peptide, full_wt_peptide)
 
                 if len(reference_matches) > 0:
                     if self._input_tsv_type(line) == 'aggregated':
@@ -559,7 +556,7 @@ def _get_unique_peptides(self, mt_records_dict, wt_records_dict):
         with open(self.input_file) as input_fh:
             reader = csv.DictReader(input_fh, delimiter='\t')
             for line in reader:
-                peptide, full_peptide = self._get_peptide(line, mt_records_dict, wt_records_dict)
+                peptide, full_peptide, full_wt_peptide = self._get_peptide(line, mt_records_dict, wt_records_dict)
                 if self.peptide_fasta:
                     if peptide is not None:
                         unique_peptides.add(peptide)