From 40cfd4a5efbbb5be1c94c6660002a13c5e502d9c Mon Sep 17 00:00:00 2001
From: Charles Cowart <ccowart@ucsd.edu>
Date: Sat, 2 Mar 2024 17:16:46 -0800
Subject: [PATCH] bugfix for subset project names

Bugfix for cases where the name of one project in a sample-sheet
contains the name of another project in the same sheet as a substring.
---
 sequence_processing_pipeline/Pipeline.py      | 20 +++++++++-
 .../tests/data/multi-project-sheet.csv        | 39 +++++++++++++++++++
 .../tests/test_Pipeline.py                    | 26 +++++++++++++
 3 files changed, 83 insertions(+), 2 deletions(-)
 create mode 100644 sequence_processing_pipeline/tests/data/multi-project-sheet.csv

diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py
index c8881f2d..bbe07ff5 100644
--- a/sequence_processing_pipeline/Pipeline.py
+++ b/sequence_processing_pipeline/Pipeline.py
@@ -660,8 +660,24 @@ def _get_sample_names_from_sample_sheet(self, project_name):
             # in a third-party library, convert the data structure to
             # JSON using the exposed method and obtain from the result.
             jsn = json_loads(self.sample_sheet.to_json())
-            return [x['Sample_Name'] for x in jsn['Data'] if
-                    f'{project_name}_' in x['Sample_Project']]
+
+            results = []
+
+            for sample in jsn['Data']:
+                # handle case where project_name includes an appended qiita-id.
+                if sample['Sample_Project'] == project_name:
+                    results.append(sample['Sample_Name'])
+                    continue
+
+                # handle case where project_name does not include a qiita-id:
+                # strip the trailing '_<digits>' and require an exact match,
+                # as one project name in a sheet may be a prefix of another.
+                # search() returns None when no qiita-id suffix is present.
+                m = search(r'^(.+)_(\d+)$', sample['Sample_Project'])
+                if m is not None and m[1] == project_name:
+                    results.append(sample['Sample_Name'])
+
+            return results
 
     def _get_sample_names_from_mapping_file(self, project_name):
         if project_name is None:
diff --git a/sequence_processing_pipeline/tests/data/multi-project-sheet.csv b/sequence_processing_pipeline/tests/data/multi-project-sheet.csv
new file mode 100644
index 00000000..10b41110
--- /dev/null
+++ b/sequence_processing_pipeline/tests/data/multi-project-sheet.csv
@@ -0,0 +1,39 @@
+[Header],,,,,,,,,,,
+IEMFileVersion,4,,,,,,,,,,
+SheetType,standard_metag,,,,,,,,,,
+SheetVersion,100,,,,,,,,,,
+Investigator Name,Knight,,,,,,,,,,
+Experiment Name,RKL0042,,,,,,,,,,
+Date,2020-02-26,,,,,,,,,,
+Workflow,GenerateFASTQ,,,,,,,,,,
+Application,FASTQ Only,,,,,,,,,,
+Assay,Metagenomic,,,,,,,,,,
+Description,,,,,,,,,,,
+Chemistry,Default,,,,,,,,,,
+,,,,,,,,,,,
+[Reads],,,,,,,,,,,
+150,,,,,,,,,,,
+150,,,,,,,,,,,
+,,,,,,,,,,,
+[Settings],,,,,,,,,,,
+ReverseComplement,0,,,,,,,,,,
+,,,,,,,,,,,
+[Data],,,,,,,,,,,
+Lane,Sample_ID,Sample_Name,Sample_Plate,well_id_384,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,syndna_pool_number,Well_description
+1,3A,3A,Something,I23,iTru7_201_03,GATAGGCT,iTru5_09_H,AGAAGGAC,Wisconsin_U19_15445,pool1,3A
+1,4A,4A,Something,K23,iTru7_201_04,TTGACAGG,iTru5_10_H,TGACCGTT,Wisconsin_U19_15445,pool1,4A
+1,5B,5B,Something,I24,iTru7_209_03,CCTGATTG,iTru5_21_H,GCCTTCTT,Wisconsin_U19_15445,pool1,5B
+1,6A,6A,Something,K24,iTru7_209_04,TTGTGTGC,iTru5_22_H,TGGACCAT,Wisconsin_U19_NA_15446,pool1,6A
+1,7A,7A,Something,J23,iTru7_402_03,ACTCAGAC,iTru5_109_H,GCATTGGT,Wisconsin_U19_NA_15446,pool1,7A
+1,8A,8A,Something,L23,iTru7_402_04,GTCCACAT,iTru5_110_H,TCCAGCAA,Wisconsin_U19_NA_15446,pool1,8A
+,,,,,,,,,,,
+[Bioinformatics],,,,,,,,,,,
+Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,,,,
+Wisconsin_U19_15445,15445,False,AACC,GGTT,True,Nextera,Equipment,,,,
+Wisconsin_U19_NA_15446,15446,False,AACC,GGTT,True,Nextera,Equipment,,,,
+,,,,,,,,,,,
+[Contact],,,,,,,,,,,
+Email,Sample_Project,,,,,,,,,,
+test@lol.com,Wisconsin_U19_15445,,,,,,,,,,
+test@lol.com,Wisconsin_U19_NA_15446,,,,,,,,,,
+,,,,,,,,,,,
diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py
index 5887dfe4..6f0d37ab 100644
--- a/sequence_processing_pipeline/tests/test_Pipeline.py
+++ b/sequence_processing_pipeline/tests/test_Pipeline.py
@@ -28,6 +28,7 @@ def setUp(self):
         makedirs(self.output_file_path, exist_ok=True)
         self.maxDiff = None
         self.good_sample_sheet_path = self.path('good-sample-sheet.csv')
+        self.mp_sheet_path = self.path('multi-project-sheet.csv')
         self.bad_sample_sheet_path = self.path('duplicate_sample-sample-sheet'
                                                '.csv')
         self.bad_assay_type_path = self.path('bad-sample-sheet-metagenomics'
@@ -124,6 +125,31 @@ def test_validate_mapping_file_numeric_ids(self):
             obs_df = pipeline._validate_mapping_file(tmp.name)
             self.assertEqual(list(obs_df['sample_name']), exp)
 
+    def test_get_sample_names_from_sample_sheet(self):
+        pipeline = Pipeline(self.good_config_file, self.good_run_id,
+                            self.mp_sheet_path, None,
+                            self.output_file_path, self.qiita_id,
+                            Pipeline.METAGENOMIC_PTYPE)
+
+        # each param below is paired with the expected set of sample-names:
+        # None -> names from every project in the sample-sheet.
+        # 'Wisconsin_U19' -> names in project 'Wisconsin_U19_15445' only.
+        # 'Wisconsin_U19_NA' -> names in project 'Wisconsin_U19_NA_15446'.
+        # 'NotAProject' -> a project not in the sheet; expect no names.
+        # fully-qualified project names (qiita_id included) should return
+        # the same names as their unqualified counterparts.
+
+        params = [None, 'Wisconsin_U19', 'Wisconsin_U19_NA', 'NotAProject',
+                  'Wisconsin_U19_15445', 'Wisconsin_U19_NA_15446']
+
+        exps = [{'3A', '4A', '5B', '6A', '7A', '8A'}, {'3A', '4A', '5B'},
+                {'6A', '8A', '7A'}, set(), {'3A', '4A', '5B'},
+                {'6A', '8A', '7A'}]
+
+        for param, exp in zip(params, exps):
+            obs = set(pipeline._get_sample_names_from_sample_sheet(param))
+            self.assertEqual(obs, exp)
+
+
     def test_required_file_checks(self):
         # begin this test by deleting the RunInfo.txt file and verifying that
         # Pipeline object will raise an Error.