From 40cfd4a5efbbb5be1c94c6660002a13c5e502d9c Mon Sep 17 00:00:00 2001
From: Charles Cowart
Date: Sat, 2 Mar 2024 17:16:46 -0800
Subject: [PATCH] bugfix for subset project names

bugfix for cases when one project in a sample-sheet is a superset of
another project in the same sheet.
---
 sequence_processing_pipeline/Pipeline.py      | 20 +++++++++-
 .../tests/data/multi-project-sheet.csv        | 39 +++++++++++++++++++
 .../tests/test_Pipeline.py                    | 26 +++++++++++++
 3 files changed, 83 insertions(+), 2 deletions(-)
 create mode 100644 sequence_processing_pipeline/tests/data/multi-project-sheet.csv

diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py
index c8881f2d..bbe07ff5 100644
--- a/sequence_processing_pipeline/Pipeline.py
+++ b/sequence_processing_pipeline/Pipeline.py
@@ -660,8 +660,24 @@ def _get_sample_names_from_sample_sheet(self, project_name):
         # in a third-party library, convert the data structure to
         # JSON using the exposed method and obtain from the result.
         jsn = json_loads(self.sample_sheet.to_json())
-        return [x['Sample_Name'] for x in jsn['Data'] if
-                f'{project_name}_' in x['Sample_Project']]
+
+        results = []
+
+        for sample in jsn['Data']:
+            # handle case where project_name includes an appended qiita-id.
+            if sample['Sample_Project'] == project_name:
+                results.append(sample['Sample_Name'])
+                continue
+
+            # handle case where project_name does not include a qiita-id.
+            # exact matching is required when one project name in a sheet
+            # is a prefix of another. skip projects that lack a trailing
+            # '_<qiita-id>' since search() returns None for them.
+            m = search(r'^(.+)_(\d+)$', sample['Sample_Project'])
+            if m and m[1] == project_name:
+                results.append(sample['Sample_Name'])
+
+        return results
 
     def _get_sample_names_from_mapping_file(self, project_name):
         if project_name is None:
diff --git a/sequence_processing_pipeline/tests/data/multi-project-sheet.csv b/sequence_processing_pipeline/tests/data/multi-project-sheet.csv
new file mode 100644
index 00000000..10b41110
--- /dev/null
+++ b/sequence_processing_pipeline/tests/data/multi-project-sheet.csv
@@ -0,0 +1,39 @@
+[Header],,,,,,,,,,,
+IEMFileVersion,4,,,,,,,,,,
+SheetType,standard_metag,,,,,,,,,,
+SheetVersion,100,,,,,,,,,,
+Investigator Name,Knight,,,,,,,,,,
+Experiment Name,RKL0042,,,,,,,,,,
+Date,2020-02-26,,,,,,,,,,
+Workflow,GenerateFASTQ,,,,,,,,,,
+Application,FASTQ Only,,,,,,,,,,
+Assay,Metagenomic,,,,,,,,,,
+Description,,,,,,,,,,,
+Chemistry,Default,,,,,,,,,,
+,,,,,,,,,,,
+[Reads],,,,,,,,,,,
+150,,,,,,,,,,,
+150,,,,,,,,,,,
+,,,,,,,,,,,
+[Settings],,,,,,,,,,,
+ReverseComplement,0,,,,,,,,,,
+,,,,,,,,,,,
+[Data],,,,,,,,,,,
+Lane,Sample_ID,Sample_Name,Sample_Plate,well_id_384,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,syndna_pool_number,Well_description
+1,3A,3A,Something,I23,iTru7_201_03,GATAGGCT,iTru5_09_H,AGAAGGAC,Wisconsin_U19_15445,pool1,3A
+1,4A,4A,Something,K23,iTru7_201_04,TTGACAGG,iTru5_10_H,TGACCGTT,Wisconsin_U19_15445,pool1,4A
+1,5B,5B,Something,I24,iTru7_209_03,CCTGATTG,iTru5_21_H,GCCTTCTT,Wisconsin_U19_15445,pool1,5B
+1,6A,6A,Something,K24,iTru7_209_04,TTGTGTGC,iTru5_22_H,TGGACCAT,Wisconsin_U19_NA_15446,pool1,6A
+1,7A,7A,Something,J23,iTru7_402_03,ACTCAGAC,iTru5_109_H,GCATTGGT,Wisconsin_U19_NA_15446,pool1,7A
+1,8A,8A,Something,L23,iTru7_402_04,GTCCACAT,iTru5_110_H,TCCAGCAA,Wisconsin_U19_NA_15446,pool1,8A
+,,,,,,,,,,,
+[Bioinformatics],,,,,,,,,,,
+Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,,,,
+Wisconsin_U19_15445,15445,False,AACC,GGTT,True,Nextera,Equipment,,,,
+Wisconsin_U19_NA_15446,15446,False,AACC,GGTT,True,Nextera,Equipment,,,,
+,,,,,,,,,,,
+[Contact],,,,,,,,,,,
+Email,Sample_Project,,,,,,,,,,
+test@lol.com,Wisconsin_U19_15445,,,,,,,,,,
+test@lol.com,Wisconsin_U19_NA_15446,,,,,,,,,,
+,,,,,,,,,,,
diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py
index 5887dfe4..6f0d37ab 100644
--- a/sequence_processing_pipeline/tests/test_Pipeline.py
+++ b/sequence_processing_pipeline/tests/test_Pipeline.py
@@ -28,6 +28,7 @@ def setUp(self):
         makedirs(self.output_file_path, exist_ok=True)
         self.maxDiff = None
         self.good_sample_sheet_path = self.path('good-sample-sheet.csv')
+        self.mp_sheet_path = self.path('multi-project-sheet.csv')
         self.bad_sample_sheet_path = self.path('duplicate_sample-sample-sheet'
                                                '.csv')
         self.bad_assay_type_path = self.path('bad-sample-sheet-metagenomics'
@@ -124,6 +125,31 @@ def test_validate_mapping_file_numeric_ids(self):
             obs_df = pipeline._validate_mapping_file(tmp.name)
             self.assertEqual(list(obs_df['sample_name']), exp)
 
+    def test_get_sample_names_from_sample_sheet(self):
+        pipeline = Pipeline(self.good_config_file, self.good_run_id,
+                            self.mp_sheet_path, None,
+                            self.output_file_path, self.qiita_id,
+                            Pipeline.METAGENOMIC_PTYPE)
+
+        # get all names from all projects in the sample-sheet.
+        # get all names in project 'Wisconsin_U19_15445'
+        # get all names in project 'Wisconsin_U19_NA_15446'
+        # attempt to get names from a project not in the sheet.
+        # what happens when the fully-qualified project name is used
+        # (includes qiita_id)?
+        # get all names in project 'Wisconsin_U19_NA_15446'
+
+        params = [None, 'Wisconsin_U19', 'Wisconsin_U19_NA', 'NotAProject',
+                  'Wisconsin_U19_15445', 'Wisconsin_U19_NA_15446']
+
+        exps = [{'3A', '4A', '5B', '6A', '7A', '8A'}, {'3A', '4A', '5B'},
+                {'6A', '8A', '7A'}, set(), {'3A', '4A', '5B'},
+                {'6A', '8A', '7A'}]
+
+        for param, exp in zip(params, exps):
+            obs = set(pipeline._get_sample_names_from_sample_sheet(param))
+            self.assertEqual(obs, exp)
+
     def test_required_file_checks(self):
         # begin this test by deleting the RunInfo.txt file and verifying that
         # Pipeline object will raise an Error.