From 9e891568164093e62168601290ab6498b58937b6 Mon Sep 17 00:00:00 2001
From: SimonVandeVyver <simon.vandevyver@ugent.be>
Date: Fri, 13 Sep 2024 10:27:18 +0200
Subject: [PATCH 1/9] add functionality to search only tryptic matches

---
 sa-index/src/peptide_search.rs |  22 +++++--
 sa-index/src/sa_searcher.rs    | 104 ++++++++++++++++++++++++---------
 sa-server/src/main.rs          |   6 +-
 3 files changed, 98 insertions(+), 34 deletions(-)
diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs
index 55d629f..b3958c7 100644
--- a/sa-index/src/peptide_search.rs
+++ b/sa-index/src/peptide_search.rs
@@ -38,6 +38,7 @@ impl From<&Protein> for ProteinInfo {
 /// * `equate_il` - Boolean indicating if we want to equate I and L during search
 /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the
 ///   taxonomy
+/// * `tryptic` - Boolean indicating if we only want tryptic matches.
 ///
 /// # Returns
 ///
@@ -50,7 +51,8 @@ pub fn search_proteins_for_peptide<'a>(
     searcher: &'a Searcher,
     peptide: &str,
     cutoff: usize,
-    equate_il: bool
+    equate_il: bool,
+    tryptic: bool
 ) -> Option<(bool, Vec<&'a Protein>)> {
     let peptide = peptide.trim_end().to_uppercase();
 
@@ -59,7 +61,7 @@ pub fn search_proteins_for_peptide<'a>(
         return None;
     }
 
-    let suffix_search = searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equate_il);
+    let suffix_search = searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equate_il, tryptic);
     let (suffixes, cutoff_used) = match suffix_search {
         SearchAllSuffixesResult::MaxMatches(matched_suffixes) => Some((matched_suffixes, true)),
         SearchAllSuffixesResult::SearchResult(matched_suffixes) => Some((matched_suffixes, false)),
@@ -71,8 +73,14 @@ pub fn search_proteins_for_peptide<'a>(
     Some((cutoff_used, proteins))
 }
 
-pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_il: bool) -> Option<SearchResult> {
-    let (cutoff_used, proteins) = search_proteins_for_peptide(searcher, peptide, cutoff, equate_il)?;
+pub fn search_peptide(
+    searcher: &Searcher,
+    peptide: &str,
+    cutoff: usize,
+    equate_il: bool,
+    tryptic: bool
+) -> Option<SearchResult> {
+    let (cutoff_used, proteins) = search_proteins_for_peptide(searcher, peptide, cutoff, equate_il, tryptic)?;
 
     Some(SearchResult {
         sequence: peptide.to_string(),
@@ -91,6 +99,7 @@ pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_
 /// * `equate_il` - Boolean indicating if we want to equate I and L during search
 /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the
 ///   taxonomy
+/// * `tryptic` - Boolean indicating if we only want tryptic matches.
 ///
 /// # Returns
 ///
@@ -99,11 +108,12 @@ pub fn search_all_peptides(
     searcher: &Searcher,
     peptides: &Vec<String>,
     cutoff: usize,
-    equate_il: bool
+    equate_il: bool,
+    tryptic: bool
 ) -> Vec<SearchResult> {
     peptides
         .par_iter()
-        .filter_map(|peptide| search_peptide(searcher, peptide, cutoff, equate_il))
+        .filter_map(|peptide| search_peptide(searcher, peptide, cutoff, equate_il, tryptic))
         .collect()
 }
 
diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs
index d09c704..87ab494 100644
--- a/sa-index/src/sa_searcher.rs
+++ b/sa-index/src/sa_searcher.rs
@@ -1,6 +1,6 @@
 use std::{cmp::min, ops::Deref};
 
-use sa_mappings::proteins::{Protein, Proteins};
+use sa_mappings::proteins::{Protein, Proteins, SEPARATION_CHARACTER, TERMINATION_CHARACTER};
 
 use crate::{
     sa_searcher::BoundSearch::{Maximum, Minimum},
@@ -297,6 +297,7 @@ impl Searcher {
     /// * `max_matches` - The maximum amount of matches processed, if more matches are found we
     ///   don't process them
     /// * `equate_il` - True if we want to equate I and L during search, otherwise false
+    /// * `tryptic` - Boolean indicating if we only want tryptic matches.
     ///
     /// # Returns
     ///
@@ -306,7 +307,8 @@ impl Searcher {
         &self,
         search_string: &[u8],
         max_matches: usize,
-        equate_il: bool
+        equate_il: bool,
+        tryptic: bool
     ) -> SearchAllSuffixesResult {
         let mut matching_suffixes: Vec<i64> = vec![];
         let mut il_locations = vec![];
@@ -334,32 +336,41 @@ impl Searcher {
                 let mut sa_index = min_bound;
                 while sa_index < max_bound {
                     let suffix = self.sa.get(sa_index) as usize;
-                    // filter away matches where I was wrongfully equalized to L, and check the
-                    // unmatched prefix when I and L equalized, we only need to
-                    // check the prefix, not the whole match, when the prefix is 0, we don't need to
-                    // check at all
-                    if suffix >= skip
-                        && ((skip == 0
+
+                    if suffix >= skip {
+                        let match_start = suffix - skip;
+                        let match_end = suffix + search_string.len() - skip;
+
+                        // filter away matches where I was wrongfully equalized to L, and check the
+                        // unmatched prefix when I and L equalized, we only need to
+                        // check the prefix, not the whole match, when the prefix is 0, we don't need to
+                        // check at all
+                        if (skip == 0
                             || Self::check_prefix(
                                 current_search_string_prefix,
-                                &self.proteins.input_string[suffix - skip..suffix],
+                                &self.proteins.input_string[match_start..suffix],
                                 equate_il
                             ))
                             && Self::check_suffix(
                                 skip,
                                 il_locations_current_suffix,
                                 current_search_string_suffix,
-                                &self.proteins.input_string[suffix..suffix + search_string.len() - skip],
+                                &self.proteins.input_string[suffix..match_end],
                                 equate_il
-                            ))
-                    {
-                        matching_suffixes.push((suffix - skip) as i64);
-
-                        // return if max number of matches is reached
-                        if matching_suffixes.len() >= max_matches {
-                            return SearchAllSuffixesResult::MaxMatches(matching_suffixes);
+                            )
+                            && (!tryptic
+                                || ((self.check_start_of_protein(match_start) || self.check_tryptic_cut(match_start))
+                                    && (self.check_end_of_protein(match_end) || self.check_tryptic_cut(match_end))))
+                        {
+                            matching_suffixes.push((suffix - skip) as i64);
+
+                            // return if max number of matches is reached
+                            if matching_suffixes.len() >= max_matches {
+                                return SearchAllSuffixesResult::MaxMatches(matching_suffixes);
+                            }
                         }
                     }
+
                     sa_index += 1;
                 }
             }
@@ -373,6 +384,47 @@ impl Searcher {
         }
     }
 
+    /// Tells whether a cut position coincides with the start of a protein.
+    ///
+    /// # Arguments
+    /// * `cut_index` - The index of the cut in the text of proteins.
+    ///
+    /// # Returns
+    ///
+    /// Returns true when the cut is at index 0 or directly follows a separation character.
+    #[inline]
+    fn check_start_of_protein(&self, cut_index: usize) -> bool {
+        cut_index.checked_sub(1).map_or(true, |prev| self.proteins.input_string[prev] == SEPARATION_CHARACTER)
+    }
+
+    /// Check if a cut is the end of a protein, without panicking when the cut lies past the last byte of the text.
+    ///
+    /// # Arguments
+    /// * `cut_index` - The index of the cut in the text of proteins; may equal the length of the text.
+    ///
+    /// # Returns
+    ///
+    /// Returns true if the cut is at a termination or separation character, or past the end of the text.
+    #[inline]
+    fn check_end_of_protein(&self, cut_index: usize) -> bool {
+        let byte_at_cut = self.proteins.input_string.get(cut_index).copied();
+        byte_at_cut.map_or(true, |c| c == TERMINATION_CHARACTER || c == SEPARATION_CHARACTER)
+    }
+
+    /// Check if a cut is tryptic: the amino acid before the cut is K or R and the amino acid at the cut is not P.
+    ///
+    /// # Arguments
+    /// * `cut_index` - The index of the cut in the text of proteins; the byte at `cut_index - 1` is read as well.
+    ///
+    /// # Returns
+    ///
+    /// Returns true if the cut is a tryptic cut.
+    #[inline]
+    fn check_tryptic_cut(&self, cut_index: usize) -> bool {
+        let text = &self.proteins.input_string;
+        matches!(text[cut_index - 1], b'K' | b'R') && text[cut_index] != b'P'
+    }
+
     /// Returns true of the prefixes are the same
     /// if `equate_il` is set to true, L and I are considered the same
     ///
@@ -545,11 +597,11 @@ mod tests {
         let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
 
         // search suffix 'VAA'
-        let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false);
+        let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false, false);
         assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![7]));
 
         // search suffix 'AC'
-        let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false);
+        let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false, false);
         assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![5, 11]));
     }
 
@@ -578,11 +630,11 @@ mod tests {
         let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
 
         // search bounds 'RIZ' with equal I and L
-        let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true);
+        let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true, false);
         assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![16]));
 
         // search bounds 'RIZ' without equal I and L
-        let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false);
+        let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false, false);
         assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches);
     }
 
@@ -605,7 +657,7 @@ mod tests {
         let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
 
         // search bounds 'IM' with equal I and L
-        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true);
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true, false);
         assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0]));
     }
 
@@ -626,7 +678,7 @@ mod tests {
         let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
         let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
 
-        let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true);
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true, false);
         assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5]));
     }
 
@@ -647,7 +699,7 @@ mod tests {
         let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
         let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
 
-        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, false);
         assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]));
     }
 
@@ -670,7 +722,7 @@ mod tests {
 
         // search all places where II is in the string IIIILL, but with a sparse SA
         // this way we check if filtering the suffixes works as expected
-        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false);
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false, false);
         assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2]));
     }
 
@@ -692,7 +744,7 @@ mod tests {
         let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
 
         // search bounds 'IM' with equal I and L
-        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, false);
         assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]));
     }
 }
diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs
index 5284546..1a1cedf 100644
--- a/sa-server/src/main.rs
+++ b/sa-server/src/main.rs
@@ -58,7 +58,9 @@ struct InputData {
     cutoff: usize,
     #[serde(default = "bool::default")]
     // default value is false // TODO: maybe default should be true?
-    equate_il: bool
+    equate_il: bool,
+    #[serde(default = "bool::default")] // default false
+    tryptic: bool
 }
 
 #[tokio::main]
@@ -83,7 +85,7 @@ async fn search(
     State(searcher): State<Arc<SparseSearcher>>,
     data: Json<InputData>
 ) -> Result<Json<Vec<SearchResult>>, StatusCode> {
-    let search_result = search_all_peptides(&searcher, &data.peptides, data.cutoff, data.equate_il);
+    let search_result = search_all_peptides(&searcher, &data.peptides, data.cutoff, data.equate_il, data.tryptic);
 
     Ok(Json(search_result))
 }

From fa13d60c4355c1c81e86576612177b1c7b59e833 Mon Sep 17 00:00:00 2001
From: SimonVandeVyver <simon.vandevyver@ugent.be>
Date: Fri, 13 Sep 2024 10:45:59 +0200
Subject: [PATCH 2/9] Add test for tryptic search

---
 sa-index/src/sa_searcher.rs | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs
index 87ab494..0be7d17 100644
--- a/sa-index/src/sa_searcher.rs
+++ b/sa-index/src/sa_searcher.rs
@@ -747,4 +747,28 @@ mod tests {
         let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, false);
         assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]));
     }
+
+    #[test]
+    fn test_tryptic_search() {
+        let text = "PAA-AAKPKAPAA$".to_string().into_bytes();
+
+        let proteins = Proteins {
+            input_string: text,
+            proteins: vec![Protein {
+                uniprot_id: String::new(),
+                taxon_id: 0,
+                functional_annotations: vec![]
+            }]
+        };
+        // all 14 suffixes in byte order ('$' < '-' < 'A' < 'K' < 'P'); suffix 2 ("A-AAK...") sorts before 11 ("AA$")
+        let sparse_sa = SuffixArray::Original(vec![13, 3, 12, 2, 11, 1, 4, 5, 9, 8, 6, 10, 0, 7], 1);
+        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+        let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
+        // "PAA" occurs at 0 and 10; index 10 is preceded by 'A' (no K/R cut), so only the match at 0 is tryptic
+        let found_suffixes_1 = searcher.search_matching_suffixes(&[b'P', b'A', b'A'], usize::MAX, false, true);
+        assert_eq!(found_suffixes_1, SearchAllSuffixesResult::SearchResult(vec![0]));
+        // "APAA" at 9 is preceded by 'K', does not start with 'P' and is followed by '$', so it is tryptic
+        let found_suffixes_2 = searcher.search_matching_suffixes(&[b'A', b'P', b'A', b'A'], usize::MAX, false, true);
+        assert_eq!(found_suffixes_2, SearchAllSuffixesResult::SearchResult(vec![9]));
+    }
 }

From 757426ea4a109d5319f0eb77ee6cedfa3ce76504 Mon Sep 17 00:00:00 2001
From: Pieter Verschaffelt <pieter.verschaffelt@ugent.be>
Date: Thu, 19 Sep 2024 11:17:51 +0200
Subject: [PATCH 3/9] Run formatter

---
 sa-index/src/sa_searcher.rs | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs
index 2687de0..dab8577 100644
--- a/sa-index/src/sa_searcher.rs
+++ b/sa-index/src/sa_searcher.rs
@@ -179,10 +179,8 @@ impl Searcher {
         while index_in_search_string < search_string.len()
             && index_in_suffix < self.proteins.text.len()
             && (search_string[index_in_search_string] == self.proteins.text.get(index_in_suffix)
-                || (search_string[index_in_search_string] == b'L'
-                    && self.proteins.text.get(index_in_suffix) == b'I')
-                || (search_string[index_in_search_string] == b'I'
-                    && self.proteins.text.get(index_in_suffix) == b'L'))
+                || (search_string[index_in_search_string] == b'L' && self.proteins.text.get(index_in_suffix) == b'I')
+                || (search_string[index_in_search_string] == b'I' && self.proteins.text.get(index_in_suffix) == b'L'))
         {
             index_in_suffix += 1;
             index_in_search_string += 1;
@@ -348,20 +346,20 @@ impl Searcher {
                         // check at all
                         if (skip == 0
                             || Self::check_prefix(
-                            current_search_string_prefix,
-                            ProteinTextSlice::new(&self.proteins.text, match_start, suffix),
-                            equate_il
-                        ))
+                                current_search_string_prefix,
+                                ProteinTextSlice::new(&self.proteins.text, match_start, suffix),
+                                equate_il
+                            ))
                             && Self::check_suffix(
-                            skip,
-                            il_locations_current_suffix,
-                            current_search_string_suffix,
-                            ProteinTextSlice::new(&self.proteins.text, suffix, match_end),
-                            equate_il
-                        )
+                                skip,
+                                il_locations_current_suffix,
+                                current_search_string_suffix,
+                                ProteinTextSlice::new(&self.proteins.text, suffix, match_end),
+                                equate_il
+                            )
                             && (!tryptic
-                            || ((self.check_start_of_protein(match_start) || self.check_tryptic_cut(match_start))
-                            && (self.check_end_of_protein(match_end) || self.check_tryptic_cut(match_end))))
+                                || ((self.check_start_of_protein(match_start) || self.check_tryptic_cut(match_start))
+                                    && (self.check_end_of_protein(match_end) || self.check_tryptic_cut(match_end))))
                         {
                             matching_suffixes.push((suffix - skip) as i64);
 

From e3cc70fa5c5cdd91d21287a8d7b2e2897979c34e Mon Sep 17 00:00:00 2001
From: Pieter Verschaffelt <pieter.verschaffelt@ugent.be>
Date: Thu, 19 Sep 2024 11:47:37 +0200
Subject: [PATCH 4/9] Add missing character to error messages

---
 text-compression/src/lib.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs
index 4866a6c..d56d4b4 100644
--- a/text-compression/src/lib.rs
+++ b/text-compression/src/lib.rs
@@ -58,7 +58,7 @@ impl ProteinText {
 
         let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
         for (i, c) in input_string.chars().enumerate() {
-            let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet");
+            let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect(&format!("Input character '{}' not in alphabet", c));
             bit_array.set(i, char_5bit as u64);
         }
 
@@ -79,7 +79,7 @@ impl ProteinText {
 
         let mut bit_array = BitArray::with_capacity(input_vec.len(), 5);
         for (i, e) in input_vec.iter().enumerate() {
-            let char_5bit: u8 = *char_to_5bit.get(e).expect("Input character not in alphabet");
+            let char_5bit: u8 = *char_to_5bit.get(&(e as u8)).expect(&format!("Input character '{}' not in alphabet", e));
             bit_array.set(i, char_5bit as u64);
         }
 
@@ -131,7 +131,7 @@ impl ProteinText {
     /// * `index` - The index of the character to change.
     /// * `value` - The character to fill in as `u8`.
     pub fn set(&mut self, index: usize, value: u8) {
-        let char_5bit: u8 = *self.char_to_5bit.get(&value).expect("Input character not in alphabet");
+        let char_5bit: u8 = *self.char_to_5bit.get(&value).expect(&format!("Input character '{}' not in alphabet", value));
         self.bit_array.set(index, char_5bit as u64);
     }
 
@@ -477,7 +477,7 @@ mod tests {
 
         let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
         for (i, c) in input_string.chars().enumerate() {
-            let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet");
+            let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect(&format!("Input character '{}' not in alphabet", c));
             bit_array.set(i, char_5bit as u64);
         }
 

From 8fdf7155c6315adc13d0c6ec9343623c352b6951 Mon Sep 17 00:00:00 2001
From: Pieter Verschaffelt <pieter.verschaffelt@ugent.be>
Date: Thu, 19 Sep 2024 11:49:05 +0200
Subject: [PATCH 5/9] Fix compilation error

---
 text-compression/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs
index d56d4b4..4c8ecf9 100644
--- a/text-compression/src/lib.rs
+++ b/text-compression/src/lib.rs
@@ -79,7 +79,7 @@ impl ProteinText {
 
         let mut bit_array = BitArray::with_capacity(input_vec.len(), 5);
         for (i, e) in input_vec.iter().enumerate() {
-            let char_5bit: u8 = *char_to_5bit.get(&(e as u8)).expect(&format!("Input character '{}' not in alphabet", e));
+            let char_5bit: u8 = *char_to_5bit.get(e).expect(&format!("Input character '{}' not in alphabet", e));
             bit_array.set(i, char_5bit as u64);
         }
 

From e308ed3b01e95f84e63d2964d4f04725c57c3e14 Mon Sep 17 00:00:00 2001
From: Pieter Verschaffelt <pieter.verschaffelt@ugent.be>
Date: Thu, 19 Sep 2024 11:52:59 +0200
Subject: [PATCH 6/9] Add support for pseudo amino acids BOUZX

---
 text-compression/src/lib.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs
index 4c8ecf9..0f0300c 100644
--- a/text-compression/src/lib.rs
+++ b/text-compression/src/lib.rs
@@ -24,7 +24,7 @@ impl ProteinText {
     /// Returns the hashmap
     fn create_char_to_5bit_hashmap() -> HashMap<u8, u8> {
         let mut hashmap = HashMap::<u8, u8>::new();
-        for (i, c) in "ACDEFGHIKLMNPQRSTVWY-$".chars().enumerate() {
+        for (i, c) in "ABCDEFGHIKLMNOPQRSTUVWXYZ-$".chars().enumerate() {
             hashmap.insert(c as u8, i as u8);
         }
 
@@ -38,7 +38,7 @@ impl ProteinText {
     /// Returns the vector
     fn create_bit5_to_char() -> Vec<u8> {
         let mut vec = Vec::<u8>::new();
-        for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() {
+        for c in "ABCDEFGHIKLMNOPQRSTUVWXYZ-$".chars() {
             vec.push(c as u8);
         }
         vec
@@ -445,7 +445,7 @@ mod tests {
         let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
         let bit5_to_char = ProteinText::create_bit5_to_char();
 
-        for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() {
+        for c in "ABCDEFGHIKLMNOPQRSTUVWXYZ-$".chars() {
             let char_5bit = char_to_5bit.get(&(c as u8)).unwrap();
             assert_eq!(c as u8, bit5_to_char[*char_5bit as usize]);
         }

From dc7e8b4eee95eb08d853cfff58b72153a955cb31 Mon Sep 17 00:00:00 2001
From: Pieter Verschaffelt <pieter.verschaffelt@ugent.be>
Date: Thu, 19 Sep 2024 14:07:34 +0200
Subject: [PATCH 7/9] Run linter

---
 text-compression/src/lib.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs
index 0f0300c..6fa7227 100644
--- a/text-compression/src/lib.rs
+++ b/text-compression/src/lib.rs
@@ -58,7 +58,8 @@ impl ProteinText {
 
         let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
         for (i, c) in input_string.chars().enumerate() {
-            let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect(&format!("Input character '{}' not in alphabet", c));
+            let char_5bit: u8 =
+                *char_to_5bit.get(&(c as u8)).expect(&format!("Input character '{}' not in alphabet", c));
             bit_array.set(i, char_5bit as u64);
         }
 
@@ -131,7 +132,8 @@ impl ProteinText {
     /// * `index` - The index of the character to change.
     /// * `value` - The character to fill in as `u8`.
     pub fn set(&mut self, index: usize, value: u8) {
-        let char_5bit: u8 = *self.char_to_5bit.get(&value).expect(&format!("Input character '{}' not in alphabet", value));
+        let char_5bit: u8 =
+            *self.char_to_5bit.get(&value).expect(&format!("Input character '{}' not in alphabet", value));
         self.bit_array.set(index, char_5bit as u64);
     }
 
@@ -477,7 +479,8 @@ mod tests {
 
         let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
         for (i, c) in input_string.chars().enumerate() {
-            let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect(&format!("Input character '{}' not in alphabet", c));
+            let char_5bit: u8 =
+                *char_to_5bit.get(&(c as u8)).expect(&format!("Input character '{}' not in alphabet", c));
             bit_array.set(i, char_5bit as u64);
         }
 

From 5094fbf0c156d5bfe96f644ca7166431bd639e53 Mon Sep 17 00:00:00 2001
From: Pieter Verschaffelt <pieter.verschaffelt@ugent.be>
Date: Thu, 19 Sep 2024 14:11:59 +0200
Subject: [PATCH 8/9] Fix warnings generated by clippy

---
 text-compression/src/lib.rs | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs
index 6fa7227..337c6a6 100644
--- a/text-compression/src/lib.rs
+++ b/text-compression/src/lib.rs
@@ -59,7 +59,7 @@ impl ProteinText {
         let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
         for (i, c) in input_string.chars().enumerate() {
             let char_5bit: u8 =
-                *char_to_5bit.get(&(c as u8)).expect(&format!("Input character '{}' not in alphabet", c));
+                *char_to_5bit.get(&(c as u8)).unwrap_or_else(|| panic!("Input character '{}' not in alphabet", c));
             bit_array.set(i, char_5bit as u64);
         }
 
@@ -80,7 +80,8 @@ impl ProteinText {
 
         let mut bit_array = BitArray::with_capacity(input_vec.len(), 5);
         for (i, e) in input_vec.iter().enumerate() {
-            let char_5bit: u8 = *char_to_5bit.get(e).expect(&format!("Input character '{}' not in alphabet", e));
+            let char_5bit: u8 =
+                *char_to_5bit.get(e).unwrap_or_else(|| panic!("Input character '{}' not in alphabet", e));
             bit_array.set(i, char_5bit as u64);
         }
 
@@ -132,8 +133,10 @@ impl ProteinText {
     /// * `index` - The index of the character to change.
     /// * `value` - The character to fill in as `u8`.
     pub fn set(&mut self, index: usize, value: u8) {
-        let char_5bit: u8 =
-            *self.char_to_5bit.get(&value).expect(&format!("Input character '{}' not in alphabet", value));
+        let char_5bit: u8 = *self
+            .char_to_5bit
+            .get(&value)
+            .unwrap_or_else(|| panic!("Input character '{}' not in alphabet", value));
         self.bit_array.set(index, char_5bit as u64);
     }
 
@@ -480,7 +483,7 @@ mod tests {
         let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
         for (i, c) in input_string.chars().enumerate() {
             let char_5bit: u8 =
-                *char_to_5bit.get(&(c as u8)).expect(&format!("Input character '{}' not in alphabet", c));
+                *char_to_5bit.get(&(c as u8)).unwrap_or_else(|| panic!("Input character '{}' not in alphabet", c));
             bit_array.set(i, char_5bit as u64);
         }
 

From 9a50fcae96792af35b5bc80fbf1d72b10e292ea1 Mon Sep 17 00:00:00 2001
From: Pieter Verschaffelt <pieter.verschaffelt@ugent.be>
Date: Thu, 19 Sep 2024 14:28:52 +0200
Subject: [PATCH 9/9] Fixed tests because of added symbols

---
 text-compression/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs
index 337c6a6..338e234 100644
--- a/text-compression/src/lib.rs
+++ b/text-compression/src/lib.rs
@@ -598,7 +598,7 @@ mod tests {
         let mut reader = std::io::BufReader::new(&data[..]);
         let compressed_text = load_compressed_text(&mut reader).unwrap();
 
-        for (i, c) in "CDEFGHIKLM".chars().enumerate() {
+        for (i, c) in "BCDEFGHIKL".chars().enumerate() {
             assert_eq!(compressed_text.get(i), c as u8);
         }
     }