From b6fb6ff65f0331a00c3f603c42ecba14eee5b63f Mon Sep 17 00:00:00 2001 From: Charles Cowart <42684307+charles-cowart@users.noreply.github.com> Date: Tue, 3 Dec 2024 10:48:35 -0800 Subject: [PATCH] Add tellread (#149) * initial add * initial cleanup * first pass at converting TELLREAD scripts * Second pass at integrating tellread scripts * third pass adding tellread * fourth pass * Fifth pass, tested on qiita-rc and then refactored. * Manually merged with current master * Manually merged with master * Updates based on testing in qiita-rc * flake8 * Small fixes * Refactor KISSLoader to be more DRY. * Pipeline.py updated to support changes in qp-klp * Version 2.0 of TellSeq support. Version 2.0 of TellSeq support removes the master tellread.sh script and the drop-in replacement TRConvertJob.py for Job()s that wrap individual steps in the original script. These steps can be used in whole or in part in varying order in the refactored SPP plugin (qp-klp). * Creation tests added for new TellReadJob() class. * flake8 * New sample files added * Added optional parameter to Pipeline() class. Added optional parameter to Pipeline() class that overwrites the values in the lane column of a sample-sheet's data section. This functionality used to reside in the qp-klp plugin and is a common usage pattern. This allows SPP to override the value in a sample-sheet's lane column with the value provided by the user at submission time. * bugfix * Fixes error Fixes error found when post-processing adapter-trimmed fastq files. All files were being moved into one of the project sub-folders, rather than into their associated folders. This appears to be due to recent implementation change. All files are now moved into their correct folder. * Rewrote test * Updated branch to use new DFSheet() functionality * Updated to recent changes in metapool * Update from testing * Updates to TRIntegrateJob based on testing * Updated sample config file * Replaced legacy exit check for tellread * recent updates * Updated tests * Update setup.py to point to merged metapool updates * New tests for slurm polling * Updates * Updates * flake8 * flake8 post merger * Fixed older test * Minor update * Remove lengthy comment * fix test * Updates based on feedback * Update based on feedback * Added renamed file * Refactored sequence counting job Request from Antonio to make TRNormCountsJob more generalized for current and upcoming work. TRNormCountsJob replaced w/SeqCountsJob: * takes a list of paths to fastq and/or fastq.gz files. * runs seqtk to count sequences and bases in parallel. * aggregator code produces a json file of counts from log output. 
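For context, the counting/aggregation flow described in this bullet can be sketched roughly as follows (illustration only; paths and numbers are made up, and the real implementation is SeqCountsJob._aggregate_counts() / aggregate_counts.py later in this patch): each array task logs the fastq path plus the tab-separated sequence/base counts reported by seqtk, and the aggregator folds those two-line logs into a single JSON document.

from json import dumps

def aggregate(two_line_logs):
    # two_line_logs: (path_line, counts_line) pairs, e.g.
    # ('/working/integrated/C501_R1.fastq.gz', '1234567\t186319617')
    results = {}
    for path_line, counts_line in two_line_logs:
        _dir, _file = path_line.rsplit('/', 1)
        seq_counts, base_pairs = counts_line.split('\t')
        results.setdefault(_dir, {})[_file] = {'seq_counts': int(seq_counts),
                                               'base_pairs': int(base_pairs)}
    return dumps(results, indent=2)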
* Update test based on randomness in output generation * Updates based on feedback * Common parse_log() method made default --- README.md | 8 +- sequence_processing_pipeline/Commands.py | 6 + sequence_processing_pipeline/ConvertJob.py | 9 +- sequence_processing_pipeline/FastQCJob.py | 17 - sequence_processing_pipeline/Job.py | 187 ++++++---- sequence_processing_pipeline/NuQCJob.py | 72 ++-- sequence_processing_pipeline/Pipeline.py | 200 +++++++---- sequence_processing_pipeline/SeqCountsJob.py | 147 ++++++++ .../TRIntegrateJob.py | 163 +++++++++ sequence_processing_pipeline/TellReadJob.py | 174 +++++++++ .../aggregate_counts.py | 40 +++ .../contrib/create_picklist.py | 72 ++++ .../contrib/integrate-indices-np.py | 332 ++++++++++++++++++ .../contrib/plot_counts.py | 27 ++ .../scripts/fake_squeue.py | 101 ++++++ .../templates/integrate.sbatch | 67 ++++ .../templates/seq_counts.sbatch | 25 ++ .../templates/tellread-cleanup.sbatch | 13 + .../templates/tellread.sbatch | 48 +++ .../20230906_FS10001773_68_BTR67708-1611.csv | 41 +++ .../tests/data/aggregate_counts_results.json | 36 ++ .../iseq_metagenomic.json | 19 + .../tests/data/fake_sample_index_list.txt | 96 +++++ .../tests/data/files_to_count.txt | 8 + .../tests/data/seq_counts.sbatch | 25 ++ .../seq_counts_logs/seq_count_2679966_1.err | 3 + .../seq_counts_logs/seq_count_2679966_1.out | 2 + .../seq_counts_logs/seq_count_2679966_2.err | 3 + .../seq_counts_logs/seq_count_2679966_2.out | 2 + .../seq_counts_logs/seq_count_2679966_3.err | 3 + .../seq_counts_logs/seq_count_2679966_3.out | 2 + .../seq_counts_logs/seq_count_2679966_4.err | 3 + .../seq_counts_logs/seq_count_2679966_4.out | 2 + .../seq_counts_logs/seq_count_2679966_5.err | 3 + .../seq_counts_logs/seq_count_2679966_5.out | 2 + .../seq_counts_logs/seq_count_2679966_6.err | 3 + .../seq_counts_logs/seq_count_2679966_6.out | 2 + .../seq_counts_logs/seq_count_2679966_7.err | 3 + .../seq_counts_logs/seq_count_2679966_7.out | 2 + .../seq_counts_logs/seq_count_2679966_8.err | 3 + .../seq_counts_logs/seq_count_2679966_8.out | 2 + .../data/tellseq_metag_dummy_sample_sheet.csv | 135 +++++++ .../data/tellseq_output/integrate_test.sbatch | 67 ++++ .../data/tellseq_output/tellread_test.sbatch | 47 +++ .../tests/test_ConvertJob.py | 2 +- .../tests/test_FastQCJob.py | 2 +- .../tests/test_Job.py | 173 ++++++++- .../tests/test_NuQCJob.py | 137 ++++---- .../tests/test_Pipeline.py | 160 +++++---- .../tests/test_SeqCountsJob.py | 74 ++++ .../tests/test_TRIntegrateJob.py | 72 ++++ .../tests/test_TellReadJob.py | 66 ++++ .../tests/test_commands.py | 25 +- .../tests/test_util.py | 32 +- sequence_processing_pipeline/util.py | 13 +- 55 files changed, 2610 insertions(+), 368 deletions(-) create mode 100644 sequence_processing_pipeline/SeqCountsJob.py create mode 100644 sequence_processing_pipeline/TRIntegrateJob.py create mode 100644 sequence_processing_pipeline/TellReadJob.py create mode 100644 sequence_processing_pipeline/aggregate_counts.py create mode 100644 sequence_processing_pipeline/contrib/create_picklist.py create mode 100644 sequence_processing_pipeline/contrib/integrate-indices-np.py create mode 100644 sequence_processing_pipeline/contrib/plot_counts.py create mode 100755 sequence_processing_pipeline/scripts/fake_squeue.py create mode 100644 sequence_processing_pipeline/templates/integrate.sbatch create mode 100644 sequence_processing_pipeline/templates/seq_counts.sbatch create mode 100644 sequence_processing_pipeline/templates/tellread-cleanup.sbatch create mode 100644 
sequence_processing_pipeline/templates/tellread.sbatch create mode 100644 sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv create mode 100644 sequence_processing_pipeline/tests/data/aggregate_counts_results.json create mode 100644 sequence_processing_pipeline/tests/data/fake_sample_index_list.txt create mode 100644 sequence_processing_pipeline/tests/data/files_to_count.txt create mode 100644 sequence_processing_pipeline/tests/data/seq_counts.sbatch create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out create mode 100644 sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv create mode 100644 sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch create mode 100644 sequence_processing_pipeline/tests/test_SeqCountsJob.py create mode 100644 sequence_processing_pipeline/tests/test_TRIntegrateJob.py create mode 100644 sequence_processing_pipeline/tests/test_TellReadJob.py diff --git a/README.md b/README.md index c51fcd0c..594e11aa 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ git clone https://github.com/biocore/mg-scripts.git Create a Python3 Conda environment in which to run the notebook: ```bash -conda create -n sp_pipeline 'python==3.9' numpy pandas click scipy matplotlib fastq-pair +conda create --yes -n spp python='python=3.9' scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair ``` Activate the Conda environment: @@ -62,3 +62,9 @@ Please note that the setting 'minimap2_databases' is expected to be a list of pa For NuQCJob, minimap2_databases is expected to be the path to a directory containing two subdirectories: 'metagenomic' and 'metatranscriptomic'. Each directory should contain or symlink to the appropriate .mmi files needed for that Assay type. 
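A small optional sanity check (not required by the pipeline; the two subdirectory names and the .mmi extension are the only assumptions, taken from the paragraph above) can confirm that a candidate minimap2_databases directory follows this layout:

```python
from glob import glob
from os.path import isdir, join


def check_minimap2_databases(base_dir):
    """Raise if the NuQCJob database layout described above is missing."""
    for assay_dir in ('metagenomic', 'metatranscriptomic'):
        path = join(base_dir, assay_dir)
        if not isdir(path) or not glob(join(path, '*.mmi')):
            raise ValueError(f"{path} is missing or contains no .mmi files")
```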
+ +Additional TellSeq-related notes: +'spades-cloudspades-0.1', 'tellread-release-novaseqX' or similar directories must be placed in a location available to SPP. +Their paths should be made known to SPP in the configuration files. (See examples for details). +Additional scripts found in sequence_processing_pipeline/contrib were contributed by Daniel and Omar and can be similarly located and configured. + diff --git a/sequence_processing_pipeline/Commands.py b/sequence_processing_pipeline/Commands.py index b2cd5e41..ae971fc9 100644 --- a/sequence_processing_pipeline/Commands.py +++ b/sequence_processing_pipeline/Commands.py @@ -22,8 +22,14 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb, # is now the following: # add one more level to account for project_names nested under ConvertJob # dir. + # this will ignore the _I1_ reads that appear in the integrated result. fastq_paths = glob.glob(data_location_path + '*/*/*.fastq.gz') + # case-specific filter for TellSeq output directories that also contain + # _I1_ files. Ensure paths are still sorted afterwards. + fastq_paths = [x for x in fastq_paths if '_I1_001.fastq.gz' not in x] + fastq_paths = sorted(fastq_paths) + # convert from GB and halve as we sum R1 max_size = (int(max_file_list_size_in_gb) * (2 ** 30) / 2) diff --git a/sequence_processing_pipeline/ConvertJob.py b/sequence_processing_pipeline/ConvertJob.py index 122a4987..dc9b36aa 100644 --- a/sequence_processing_pipeline/ConvertJob.py +++ b/sequence_processing_pipeline/ConvertJob.py @@ -156,7 +156,13 @@ def run(self, callback=None): exec_from=self.log_path, callback=callback) - self.copy_controls_between_projects() + # ConvertJob() is used to process Amplicon as well as Meta*Omic + # runs. Amplicon runs use a dummy sample-sheet generated by + # Pipeline(). For these types of sheets we can't copy controls + # between projects because demuxing is not performed here. + _, sheet_name = split(self.sample_sheet_path) + if sheet_name != 'dummy_sample_sheet.csv': + self.copy_controls_between_projects() except JobFailedError as e: # When a job has failed, parse the logs generated by this specific @@ -169,6 +175,7 @@ def run(self, callback=None): logging.info(f'Successful job: {job_info}') def parse_logs(self): + # overrides Job.parse_logs() w/tailored parse for specific logs. log_path = join(self.output_path, 'Logs') errors = join(log_path, 'Errors.log') diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 5e0bf4fc..8db0440b 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -6,7 +6,6 @@ from functools import partial from json import dumps import logging -import glob class FastQCJob(Job): @@ -305,19 +304,3 @@ def _generate_job_script(self): with open(sh_details_fp, 'w') as f: f.write('\n'.join(self.commands)) - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - files = sorted(glob.glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - # note 'error' is not same - # requirement as found in QCJob. - # ('error:'). This is a very - # generalized filter. 
- if 'error' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index b650543e..4121bd7f 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -1,3 +1,6 @@ +from jinja2 import BaseLoader, TemplateNotFound +from os.path import getmtime +import pathlib from itertools import zip_longest from os import makedirs, walk from os.path import basename, exists, split, join @@ -9,6 +12,26 @@ import logging from inspect import stack import re +from collections import Counter +from glob import glob + + +# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader +class KISSLoader(BaseLoader): + def __init__(self, path): + # pin the path for loader to the location sequence_processing_pipeline + # (the location of this file), along w/the relative path to the + # templates directory. + self.path = join(pathlib.Path(__file__).parent.resolve(), path) + + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) class Job: @@ -32,6 +55,7 @@ class Job: slurm_status_running) polling_interval_in_seconds = 60 + squeue_retry_in_seconds = 10 def __init__(self, root_dir, output_path, job_name, executable_paths, max_array_length, modules_to_load=None): @@ -103,7 +127,17 @@ def run(self): raise PipelineError("Base class run() method not implemented.") def parse_logs(self): - raise PipelineError("Base class parse_logs() method not implemented.") + # by default, look for anything to parse in the logs directory. + log_path = join(self.output_path, 'logs') + files = sorted(glob(join(log_path, '*'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if 'error:' in line.lower()] + + return [msg.strip() for msg in msgs] def _which(self, file_path, modules_to_load=None): """ @@ -212,6 +246,41 @@ def _system_call(self, cmd, allow_return_codes=[], callback=None): return {'stdout': stdout, 'stderr': stderr, 'return_code': return_code} + def _query_slurm(self, job_ids): + # query_slurm encapsulates the handling of squeue. + count = 0 + while True: + result = self._system_call("squeue -t all -j " + f"{','.join(job_ids)} " + "-o '%i,%T'") + + if result['return_code'] == 0: + # there was no issue w/squeue, break this loop and + # continue. + break + else: + # there was likely an intermittent issue w/squeue. Pause + # and wait before trying a few more times. If the problem + # persists then report the error and exit. + count += 1 + + if count > 3: + raise ExecFailedError(result['stderr']) + + sleep(Job.squeue_retry_in_seconds) + + lines = result['stdout'].split('\n') + lines.pop(0) # remove header + lines = [x.split(',') for x in lines if x != ''] + + jobs = {} + for job_id, state in lines: + # ensure unique_id is of type string for downstream use. + job_id = str(job_id) + jobs[job_id] = state + + return jobs + def wait_on_job_ids(self, job_ids, callback=None): ''' Wait for the given job-ids to finish running before returning. @@ -229,63 +298,27 @@ def wait_on_job_ids(self, job_ids, callback=None): # ensure all ids are strings to ensure proper working w/join(). job_ids = [str(x) for x in job_ids] - def query_slurm(job_ids): - # internal function query_slurm encapsulates the handling of - # squeue. 
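# A minimal usage sketch (not part of this diff) of the shared KISSLoader
# now defined in Job.py; 'example.sbatch' is a hypothetical template name,
# while the TellSeq jobs added later in this patch load real templates such
# as 'tellread.sbatch' and 'seq_counts.sbatch' in exactly this way.
from jinja2 import Environment
from sequence_processing_pipeline.Job import KISSLoader

jinja_env = Environment(loader=KISSLoader('templates'))
template = jinja_env.get_template('example.sbatch')
script_text = template.render({'job_name': 'example',
                               'queue_name': 'qiita',
                               'wall_time_limit': 1440})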
- count = 0 - while True: - result = self._system_call("squeue -t all -j " - f"{','.join(job_ids)} " - "-o '%F,%A,%T'") - - if result['return_code'] == 0: - # there was no issue w/squeue, break this loop and - # continue. - break - else: - # there was a likely intermittent issue w/squeue. Pause - # and wait before trying a few more times. If the problem - # persists then report the error and exit. - count += 1 - - if count > 3: - raise ExecFailedError(result['stderr']) - - sleep(60) - - lines = result['stdout'].split('\n') - lines.pop(0) # remove header - lines = [x.split(',') for x in lines if x != ''] - - jobs = {} - child_jobs = {} - for job_id, unique_id, state in lines: - jobs[unique_id] = state - - if unique_id != job_id: - child_jobs[unique_id] = job_id # job is a child job - - return jobs, child_jobs - while True: - jobs, child_jobs = query_slurm(job_ids) - - for jid in job_ids: - logging.debug("JOB %s: %s" % (jid, jobs[jid])) - if callback is not None: - callback(jid=jid, status=jobs[jid]) - - children = [x for x in child_jobs if child_jobs[x] == jid] - if len(children) == 0: - logging.debug("\tNO CHILDREN") - for cid in children: - logging.debug("\tCHILD JOB %s: %s" % (cid, jobs[cid])) - status = [jobs[x] in Job.slurm_status_not_running for x in job_ids] - - if set(status) == {True}: - # all jobs either completed successfully or terminated. + # Because query_slurm only returns state on the job-ids we specify, + # the wait process is a simple check to see whether any of the + # states are 'running' states or not. + jobs = self._query_slurm(job_ids) + + # jobs will be a dict of job-ids or array-ids for jobs that + # are array-jobs. the value of jobs[id] will be a state e.g.: + # 'RUNNING', 'FAILED', 'COMPLETED'. + states = [jobs[x] in Job.slurm_status_not_running for x in jobs] + + if set(states) == {True}: + # if all the states are either FAILED or COMPLETED + # then the set of those states no matter how many + # array-jobs there were will ultimately be the set of + # {True}. If not then that means there are still jobs + # that are running. break + logging.debug(f"sleeping {Job.polling_interval_in_seconds} " + "seconds...") sleep(Job.polling_interval_in_seconds) return jobs @@ -345,16 +378,48 @@ def submit_job(self, script_path, job_parameters=None, # to the user. results = self.wait_on_job_ids([job_id], callback=callback) - job_result = {'job_id': job_id, 'job_state': results[job_id]} + if job_id in results: + # job is a non-array job + job_result = {'job_id': job_id, 'job_state': results[job_id]} + else: + # job is an array job + # assume all array jobs in this case will be associated w/job_id. + counts = Counter() + for array_id in results: + counts[results[array_id]] += 1 + + # for array jobs we won't be returning a string representing the + # state of a single job. Instead we're returning a dictionary of + # the number of unique states the set of array-jobs ended up in and + # the number for each one. 
+ job_result = {'job_id': job_id, 'job_state': dict(counts)} if callback is not None: - callback(jid=job_id, status=job_result['job_state']) + if isinstance(job_result['job_state'], dict): + # this is an array job + states = [] + for key in counts: + states.append(f"{key}: {counts[key]}") - if job_result['job_state'] == 'COMPLETED': - return job_result + callback(jid=job_id, status=", ".join(states)) + + else: + # this is a standard job + callback(jid=job_id, status=job_result['job_state']) + + if isinstance(job_result['job_state'], dict): + states = list(job_result['job_state'].keys()) + if states == ['COMPLETED']: + return job_result + else: + raise JobFailedError(f"job {job_id} exited with jobs in the " + f"following states: {', '.join(states)}") else: - raise JobFailedError(f"job {job_id} exited with status " - f"{job_result['job_state']}") + if job_result['job_state'] == 'COMPLETED': + return job_result + else: + raise JobFailedError(f"job {job_id} exited with status " + f"{job_result['job_state']}") def _group_commands(self, cmds): # break list of commands into chunks of max_array_length (Typically diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index b2ffd3a9..0e05b41d 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -1,8 +1,7 @@ -from jinja2 import BaseLoader, TemplateNotFound from metapool import load_sample_sheet from os import stat, makedirs, rename -from os.path import join, basename, dirname, exists, abspath, getmtime -from sequence_processing_pipeline.Job import Job +from os.path import join, basename, dirname, exists, abspath, split +from sequence_processing_pipeline.Job import Job, KISSLoader from sequence_processing_pipeline.PipelineError import (PipelineError, JobFailedError) from sequence_processing_pipeline.Pipeline import Pipeline @@ -11,28 +10,9 @@ from sequence_processing_pipeline.Commands import split_similar_size_bins from sequence_processing_pipeline.util import iter_paired_files from jinja2 import Environment -import glob +from glob import glob import re from sys import executable -import pathlib - - -# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader -class KISSLoader(BaseLoader): - def __init__(self, path): - # pin the path for loader to the location sequence_processing_pipeline - # (the location of this file), along w/the relative path to the - # templates directory. 
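# Illustration only (not from this diff) of the array-job summary that
# submit_job() now derives from wait_on_job_ids(); the Slurm ids are made up.
from collections import Counter

results = {'1234567_1': 'COMPLETED',
           '1234567_2': 'COMPLETED',
           '1234567_3': 'FAILED'}      # as returned for a 3-task array job
counts = Counter(results.values())     # Counter({'COMPLETED': 2, 'FAILED': 1})
job_result = {'job_id': '1234567', 'job_state': dict(counts)}
status = ', '.join(f'{k}: {v}' for k, v in counts.items())
# submit_job() would raise JobFailedError here because the observed states
# are not exclusively 'COMPLETED'.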
- self.path = join(pathlib.Path(__file__).parent.resolve(), path) - - def get_source(self, environment, template): - path = join(self.path, template) - if not exists(path): - raise TemplateNotFound(template) - mtime = getmtime(path) - with open(path) as f: - source = f.read() - return source, path, lambda: mtime == getmtime(path) logging.basicConfig(level=logging.DEBUG) @@ -128,6 +108,9 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, self.minimum_bytes = 3100 self.fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}' r'\.fastq\.gz$') + self.interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d' + r'_\d{3}\.interleave\.fastq' + r'\.gz$') self.html_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.html$') self.json_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.json$') @@ -167,7 +150,7 @@ def _filter_empty_fastq_files(self, filtered_directory, ''' empty_list = [] - files = glob.glob(join(filtered_directory, f'*.{self.suffix}')) + files = glob(join(filtered_directory, f'*.{self.suffix}')) for r1, r2 in iter_paired_files(files): full_path = join(filtered_directory, r1) @@ -194,7 +177,7 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst): substr = regex.search(file_name) if substr is None: raise ValueError(f"{file_name} does not follow naming " - " pattern.") + "pattern.") else: # check if found substring is a member of this # project. Note sample-name != sample-id @@ -214,8 +197,7 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst): for fp in files_to_move: move(fp, dst) - @staticmethod - def _move_trimmed_files(project_name, output_path): + def _move_trimmed_files(self, project_name, output_path): ''' Given output_path, move all fastqs to a new subdir named project_name. :param project_name: The name of the new folder to be created. @@ -229,8 +211,16 @@ def _move_trimmed_files(project_name, output_path): # this directory shouldn't already exist. makedirs(join(output_path, project_name), exist_ok=False) - for trimmed_file in list(glob.glob(pattern)): - move(trimmed_file, join(output_path, project_name)) + sample_ids = [x[0] for x in self.sample_ids + if x[1] == project_name] + + for trimmed_file in list(glob(pattern)): + file_name = split(trimmed_file)[1] + substr = self.interleave_fastq_regex.search(file_name) + if substr is not None: + # only move the sample_ids in this project. + if substr[1] in sample_ids: + move(trimmed_file, join(output_path, project_name)) else: raise ValueError(f"'{output_path}' does not exist") @@ -282,10 +272,9 @@ def run(self, callback=None): for project in self.project_data: project_name = project['Sample_Project'] needs_human_filtering = project['HumanFiltering'] - source_dir = join(self.output_path, project_name) pattern = f"{source_dir}/*.fastq.gz" - completed_files = list(glob.glob(pattern)) + completed_files = list(glob(pattern)) # if the 'only-adapter-filtered' directory exists, move the files # into a unique location so that files from multiple projects @@ -294,7 +283,7 @@ def run(self, callback=None): 'only-adapter-filtered') if exists(trimmed_only_path): - NuQCJob._move_trimmed_files(project_name, trimmed_only_path) + self._move_trimmed_files(project_name, trimmed_only_path) if needs_human_filtering is True: filtered_directory = join(source_dir, 'filtered_sequences') @@ -330,7 +319,7 @@ def run(self, callback=None): # move all html files underneath the subdirectory for this project. 
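# Quick standalone check (not part of this diff) of the new interleave
# regex consulted by _move_trimmed_files(); the file name below is made up.
import re

interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}'
                                    r'\.interleave\.fastq\.gz$')

fname = 'Tissue_1_Super_Trizol_S19_L001_R1_001.interleave.fastq.gz'
substr = interleave_fastq_regex.search(fname)
assert substr is not None and substr[1] == 'Tissue_1_Super_Trizol'
# _move_trimmed_files() then moves the file only if substr[1] is one of the
# Sample_IDs registered to the project currently being processed.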
pattern = f"{old_html_path}/*.html" - completed_htmls = list(glob.glob(pattern)) + completed_htmls = list(glob(pattern)) self._move_helper(completed_htmls, # Tissue_1_Super_Trizol_S19_L001_R1_001.html self.html_regex, @@ -339,7 +328,7 @@ def run(self, callback=None): # move all json files underneath the subdirectory for this project. pattern = f"{old_json_path}/*.json" - completed_jsons = list(glob.glob(pattern)) + completed_jsons = list(glob(pattern)) self._move_helper(completed_jsons, # Tissue_1_Super_Trizol_S19_L001_R1_001.json self.json_regex, @@ -357,7 +346,7 @@ def _confirm_job_completed(self): # since NuQCJob processes across all projects in a run, there isn't # a need to iterate by project_name and job_id. pattern = f"{self.output_path}/hds-{self.qiita_job_id}.*.completed" - completed_files = list(glob.glob(pattern)) + completed_files = list(glob(pattern)) if completed_files: return True @@ -510,16 +499,3 @@ def _generate_job_script(self, max_bucket_size): pmls_path=self.pmls_path)) return job_script_path - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - # sorted lists give predictable results - files = sorted(glob.glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - if 'error:' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 21b460c2..9a30c2a0 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -173,25 +173,20 @@ def get_qiita_id_from_sif_fp(fp): hacky_name_pieces_dict = parse_project_name(temp_name) return hacky_name_pieces_dict[QIITA_ID_KEY] - def __init__(self, configuration_file_path, run_id, sample_sheet_path, - mapping_file_path, output_path, qiita_job_id, pipeline_type): + def __init__(self, configuration_file_path, run_id, input_file_path, + output_path, qiita_job_id, pipeline_type, lane_number=None): """ Initialize Pipeline object w/configuration information. :param configuration_file_path: Path to configuration.json file. :param run_id: Used w/search_paths to locate input run_directory. - :param sample_sheet_path: Path to sample-sheet. - :param mapping_file_path: Path to mapping file. + :param input_file_path: Path to sample-sheet or pre-prep file. :param output_path: Path where all pipeline-generated files live. :param qiita_job_id: Qiita Job ID creating this Pipeline. :param pipeline_type: Pipeline type ('Amplicon', 'Metagenomic', etc.) + :param lane_number: (Optional) overwrite lane_number in input_file. 
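# Hypothetical caller-side usage (not from this diff) of the reworked
# constructor documented above: a single input_file_path now covers both
# sample-sheets and pre-prep (mapping) files, and lane_number optionally
# overrides the Lane column for non-amplicon runs.  All paths, run-id and
# Qiita job id values below are placeholders.
from sequence_processing_pipeline.Pipeline import Pipeline

metag = Pipeline('configuration.json',
                 '211021_A00000_0000_SAMPLE',
                 'metagenomic_sheet.csv',       # a sample-sheet
                 'output_dir',
                 'abcdabcdabcdabcdabcdabcd',
                 'Metagenomic',
                 lane_number=2)

amplicon = Pipeline('configuration.json',
                    '211021_A00000_0000_SAMPLE',
                    'pre_prep_file.txt',        # a pre-prep (mapping) file
                    'output_dir',
                    'abcdabcdabcdabcdabcdabcd',
                    'Amplicon')                 # lane_number is ignored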
""" - if sample_sheet_path is not None and mapping_file_path is not None: - raise PipelineError("sample_sheet_path or mapping_file_path " - "must be defined, but not both.") - - if sample_sheet_path is None and mapping_file_path is None: - raise PipelineError("sample_sheet_path or mapping_file_path " - "must be defined, but not both.") + if input_file_path is None: + raise PipelineError("user_input_file_path cannot be None") if pipeline_type not in Pipeline.pipeline_types: raise PipelineError(f"'{type}' is not a valid pipeline type.") @@ -235,22 +230,36 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, self.run_id = run_id self.qiita_job_id = qiita_job_id self.pipeline = [] + self.assay_type = None - if sample_sheet_path: - self.search_paths = self.configuration['search_paths'] - self.sample_sheet = self._validate_sample_sheet(sample_sheet_path) - self.mapping_file = None - else: + # this method will catch a run directory as well as its products + # directory, which also has the same name. Hence, return the + # shortest matching path as that will at least return the right + # path between the two. + results = [] + + if pipeline_type == Pipeline.AMPLICON_PTYPE: self.search_paths = self.configuration['amplicon_search_paths'] - self.mapping_file = self._validate_mapping_file(mapping_file_path) - # unlike _validate_sample_sheet() which returns a SampleSheet - # object that stores the path to the file it was created from, - # _validate_mapping_file() just returns a DataFrame. Store the - # path to the original mapping file itself as well. - self.mapping_file_path = mapping_file_path - self.sample_sheet = None + self.assay_type = Pipeline.AMPLICON_ATYPE + else: + self.search_paths = self.configuration['search_paths'] + + for search_path in self.search_paths: + logging.debug(f'Searching {search_path} for {self.run_id}') + for entry in listdir(search_path): + some_path = join(search_path, entry) + # ensure some_path never ends in '/' + some_path = some_path.rstrip('/') + if isdir(some_path) and some_path.endswith(self.run_id): + logging.debug(f'Found {some_path}') + results.append(some_path) - self.run_dir = self._search_for_run_dir() + if results: + results.sort(key=lambda s: len(s)) + self.run_dir = results[0] + else: + raise PipelineError(f"A run-dir for '{self.run_id}' could not be " + "found") # required files for successful operation # both RTAComplete.txt and RunInfo.xml should reside in the root of @@ -268,14 +277,78 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, except PermissionError: raise PipelineError('RunInfo.xml is present, but not readable') - if self.mapping_file is not None: + self.input_file_path = input_file_path + + if pipeline_type == Pipeline.AMPLICON_PTYPE: + # assume input_file_path references a pre-prep (mapping) file. + + self.mapping_file = self._validate_mapping_file(input_file_path) + # unlike _validate_sample_sheet() which returns a SampleSheet + # object that stores the path to the file it was created from, + # _validate_mapping_file() just returns a DataFrame. Store the + # path to the original mapping file itself as well. + # create dummy sample-sheet output_fp = join(output_path, 'dummy_sample_sheet.csv') self.generate_dummy_sample_sheet(self.run_dir, output_fp) - self.sample_sheet = output_fp + self.dummy_sheet_path = output_fp + + # Optional lane_number parameter is ignored for Amplicon + # runs, as the only valid value is 1. 
+ else: + if lane_number is not None: + # confirm that the lane_number is a reasonable value. + lane_number = int(lane_number) + if lane_number < 1 or lane_number > 8: + raise ValueError(f"'{lane_number}' is not a valid name" + " number") + + # overwrite sample-sheet w/DFSheets processed version + # with overwritten Lane number. + sheet = load_sample_sheet(input_file_path) + with open(input_file_path, 'w') as f: + sheet.write(f, lane=lane_number) + + # assume user_input_file_path references a sample-sheet. + self.sample_sheet = self._validate_sample_sheet(input_file_path) + self.mapping_file = None + + if self.assay_type is None: + # set self.assay_type for non-amplicon types. + assay_type = self.sample_sheet.Header['Assay'] + if assay_type not in Pipeline.assay_types: + raise ValueError(f"'{assay_type} is not a valid Assay type") + self.assay_type = assay_type self._configure_profile() + def get_sample_sheet_path(self): + """ + Returns path to a sample-sheet or dummy sample-sheet for amplicon runs. + """ + if self.assay_type == Pipeline.AMPLICON_ATYPE: + # assume self.dummy_sheet_path has been created for amplicon runs. + return self.dummy_sheet_path + else: + # assume input_file_path is a sample-sheet for non-amplicon runs. + return self.input_file_path + + def get_software_configuration(self, software): + if software is None or software == "": + raise ValueError(f"'{software}' is not a valid value") + + key_order = ['profile', 'configuration', software] + + config = self.config_profile + + for key in key_order: + if key in config: + config = config[key] + else: + raise PipelineError(f"'{key}' is not defined in configuration") + + return config + def identify_reserved_words(self, words): ''' Returns a list of words that should not appear as column names in any @@ -294,7 +367,7 @@ def identify_reserved_words(self, words): # specifically how the proper set of prep-info file columns are # generated. For now the functionality will be defined here as this # area of metapool is currently in flux. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: reserved = PREP_MF_COLUMNS else: # results will be dependent on SheetType and SheetVersion of @@ -313,15 +386,6 @@ def _configure_profile(self): # from self.sample_sheet (or self.mapping_file). instr_type = InstrumentUtils.get_instrument_type(self.run_dir) - if isinstance(self.sample_sheet, str): - # if self.sample_sheet is a file instead of a KLSampleSheet() - # type, then this is an Amplicon run. - assay_type = Pipeline.AMPLICON_ATYPE - else: - assay_type = self.sample_sheet.Header['Assay'] - if assay_type not in Pipeline.assay_types: - raise ValueError(f"'{assay_type} is not a valid Assay type") - # open the configuration profiles directory as specified by # profiles_path in the configuration.json file. parse each json into # a nested dictionary keyed by (instrument-type, assay-type) as @@ -381,40 +445,17 @@ def _configure_profile(self): i_type = profile['profile']['instrument_type'] a_type = profile['profile']['assay_type'] - if i_type == instr_type and a_type == assay_type: + if i_type == instr_type and a_type == self.assay_type: selected_profile = profile break if selected_profile is None: - raise ValueError(f"a matching profile ({instr_type}, {assay_type}" - ") was not found. Please notify an administrator") + raise ValueError(f"a matching profile ({instr_type}, " + f"{self.assay_type}) was not found. 
Please notify" + " an administrator") self.config_profile = selected_profile - def _search_for_run_dir(self): - # this method will catch a run directory as well as its products - # directory, which also has the same name. Hence, return the - # shortest matching path as that will at least return the right - # path between the two. - results = [] - - for search_path in self.search_paths: - logging.debug(f'Searching {search_path} for {self.run_id}') - for entry in listdir(search_path): - some_path = join(search_path, entry) - # ensure some_path never ends in '/' - some_path = some_path.rstrip('/') - if isdir(some_path) and some_path.endswith(self.run_id): - logging.debug(f'Found {some_path}') - results.append(some_path) - - if results: - results.sort(key=lambda s: len(s)) - return results[0] - - raise PipelineError(f"A run-dir for '{self.run_id}' could not be " - "found") - def _directory_check(self, directory_path, create=False): if exists(directory_path): logging.debug("directory '%s' exists." % directory_path) @@ -591,7 +632,7 @@ def generate_sample_info_files(self, addl_info=None): :param addl_info: A df of (sample-name, project-name) pairs. :return: A list of paths to sample-information-files. """ - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: # Generate a list of BLANKs for each project. temp_df = self.mapping_file[[SAMPLE_NAME_KEY, _PROJECT_NAME_KEY]] temp_df_as_dicts_list = temp_df.to_dict(orient='records') @@ -670,7 +711,7 @@ def get_sample_ids(self): # test for self.mapping_file, since self.sample_sheet will be # defined in both cases. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: results = list(self.mapping_file.sample_name) else: results = [x.Sample_ID for x in self.sample_sheet.samples] @@ -685,7 +726,7 @@ def get_sample_names(self, project_name=None): ''' # test for self.mapping_file, since self.sample_sheet will be # defined in both cases. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: return self._get_sample_names_from_mapping_file(project_name) else: return self._get_sample_names_from_sample_sheet(project_name) @@ -775,12 +816,9 @@ def _parse_project_name(self, project_name, short_names): return proj_info[PROJECT_SHORT_NAME_KEY], proj_info[QIITA_ID_KEY] def get_project_info(self, short_names=False): - # test for self.mapping_file, since self.sample_sheet will be - # defined in both cases. 
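# Rough shape (assumed, for illustration only) of a configuration profile
# as consumed by _configure_profile() and get_software_configuration()
# above; the key names mirror the code, while the instrument type, the
# 'some-software' section name and its settings are placeholders.
example_profile = {
    'profile': {
        'instrument_type': 'NovaSeq 6000',
        'assay_type': 'Metagenomic',
        'configuration': {
            'some-software': {'queue': 'qiita',
                              'wallclock_time_in_minutes': 1440}
        }
    }
}

# get_software_configuration('some-software') performs this traversal and
# raises PipelineError if any key is absent:
config = example_profile
for key in ['profile', 'configuration', 'some-software']:
    config = config[key]
assert config == {'queue': 'qiita', 'wallclock_time_in_minutes': 1440}
# _configure_profile(), in turn, selects the profile whose instrument_type
# and assay_type match the current run.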
results = [] - contains_replicates = None - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: if CONTAINS_REPLICATES_KEY in self.mapping_file: contains_replicates = True else: @@ -792,25 +830,35 @@ def get_project_info(self, short_names=False): {p: parse_project_name(p) for p in sample_project_map} else: projects_info = self.sample_sheet.get_projects_details() - # endif mapping_file if short_names: proj_name_key = PROJECT_SHORT_NAME_KEY else: proj_name_key = PROJECT_FULL_NAME_KEY - # endif + for curr_project_info in projects_info.values(): curr_dict = { _PROJECT_NAME_KEY: curr_project_info[proj_name_key], QIITA_ID_KEY: curr_project_info[QIITA_ID_KEY] } - if contains_replicates is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: + # this is a mapping file: curr_contains_reps = contains_replicates else: - curr_contains_reps = \ - curr_project_info.get(CONTAINS_REPLICATES_KEY, False) - # endif + bi_df = self.sample_sheet.Bioinformatics + if CONTAINS_REPLICATES_KEY in bi_df.columns.tolist(): + # subselect rows in [Bioinformatics] based on whether they + # match the project name. + df = bi_df.loc[bi_df['Sample_Project'] == + curr_project_info[proj_name_key]] + # since only one project can match by definition, convert + # to dict and extract the needed value. + curr_contains_reps = df.iloc[0].to_dict()[ + CONTAINS_REPLICATES_KEY] + else: + curr_contains_reps = False + curr_dict[CONTAINS_REPLICATES_KEY] = curr_contains_reps results.append(curr_dict) # next project diff --git a/sequence_processing_pipeline/SeqCountsJob.py b/sequence_processing_pipeline/SeqCountsJob.py new file mode 100644 index 00000000..f080bd00 --- /dev/null +++ b/sequence_processing_pipeline/SeqCountsJob.py @@ -0,0 +1,147 @@ +from os.path import join, split +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from os import walk +from json import dumps +from glob import glob + + +logging.basicConfig(level=logging.DEBUG) + + +class SeqCountsJob(Job): + def __init__(self, run_dir, output_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, files_to_count_path, + cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: A hard-limit for array-sizes + :param files_to_count_path: A path to a list of file-paths to count. + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'SeqCountsJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + + # raise an Error if jmem is not a valid floating point value. 
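# Hypothetical caller-side set-up (not from this diff) for SeqCountsJob:
# one fastq path per line is written to files_to_count.txt, and the job
# sizes its Slurm array from the number of non-empty lines in that file.
# All paths and resource values below are placeholders.
from glob import glob
from os.path import join

run_dir = '/sequencing/211021_A00000_0000_SAMPLE'
out_dir = '/working/my_qiita_job/SeqCountsJob'
fastq_paths = sorted(glob('/working/my_qiita_job/fastq/*/*.fastq.gz'))

files_to_count = join(out_dir, 'files_to_count.txt')
with open(files_to_count, 'w') as f:
    f.write('\n'.join(fastq_paths))

# job = SeqCountsJob(run_dir, out_dir, 'qiita', 1, 1440, '8', [],
#                    'my_qiita_job', 1000, files_to_count)
# job.run()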
+ self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.jinja_env = Environment(loader=KISSLoader('templates')) + + self.job_name = (f"seq_counts_{self.qiita_job_id}") + self.files_to_count_path = files_to_count_path + + with open(self.files_to_count_path, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + lines = [x for x in lines if x != ''] + self.file_count = len(lines) + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'SeqCountsJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + self._aggregate_counts() + + logging.debug(f'SeqCountJob {self.job_info["job_id"]} completed') + + def _generate_job_script(self): + job_script_path = join(self.output_path, "seq_counts.sbatch") + template = self.jinja_env.get_template("seq_counts.sbatch") + + # got to make files_to_count.txt and put it in the output directory + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "seq_counts", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "queue_name": self.queue_name, + "file_count": self.file_count, + "output_path": self.output_path + })) + + return job_script_path + + def parse_logs(self): + # overrides Job.parse_logs() w/tailored parse for specific logs. 
+ files = sorted(glob(join(self.log_path, '*.err'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if line.startswith("[E::stk_size]")] + + return [msg.strip() for msg in msgs] + + def _aggregate_counts(self): + def extract_metadata(fp): + with open(fp, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + if len(lines) != 2: + raise ValueError("error processing %s" % fp) + _dir, _file = split(lines[0]) + seq_counts, base_pairs = lines[1].split('\t') + return _dir, _file, int(seq_counts), int(base_pairs) + + results = {} + + for root, dirs, files in walk(self.log_path): + for _file in files: + if _file.endswith('.out'): + log_output_file = join(root, _file) + _dir, _file, seq_counts, base_pairs = \ + extract_metadata(log_output_file) + + if _dir not in results: + results[_dir] = {} + + results[_dir][_file] = {'seq_counts': seq_counts, + 'base_pairs': base_pairs} + + results_path = join(self.output_path, 'aggregate_counts.json') + + with open(results_path, 'w') as f: + print(dumps(results, indent=2), file=f) + + return results_path diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py new file mode 100644 index 00000000..7b8740b4 --- /dev/null +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -0,0 +1,163 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet +from os import makedirs +from shutil import copyfile + + +logging.basicConfig(level=logging.DEBUG) + + +class TRIntegrateJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, integrate_script_path, sil_path, raw_fastq_dir, + reference_base, reference_map, cores_per_task): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param integrate_script_path: None + :param sil_path: A path to a confidential file mapping C5xx, adapters. + :param reference_base: None + :param reference_map: None + :param cores_per_task: # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TRIntegrateJob', + [], + # max_array_length and self.max_array_length are + # not used by TRIntegrateJob. 
+ -1, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + self.integrate_script_path = integrate_script_path + self.sil_path = sil_path + self.raw_fastq_dir = raw_fastq_dir + self.tmp_dir = join(self.output_path, 'tmp') + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. + self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.sample_count = len(self.sample_ids) + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.job_name = (f"integrate_{self.qiita_job_id}") + + with open(self.sil_path, 'r') as f: + # obtain the number of unique barcode_ids as determined by + # TellReadJob() in order to set up an array job of the + # proper length. + lines = f.readlines() + lines = [x.strip() for x in lines] + lines = [x for x in lines if x != ''] + self.barcode_id_count = len(lines) + + def run(self, callback=None): + job_script_path = self._generate_job_script() + + # copy sil_path to TRIntegrate working directory and rename to a + # predictable name. + copyfile(self.sil_path, + join(self.output_path, 'sample_index_list.txt')) + + # generate the tailored subset of adapter to barcode_id based on + # the proprietary lists owned by the manufacturer and supplied by + # the caller, and the barcode ids found in the sample-sheet. + self._generate_sample_index_list() + + makedirs(self.tmp_dir) + + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. 
+ return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'integrate_test.sbatch') + template = self.jinja_env.get_template("integrate.sbatch") + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "integrate", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "integrate_script_path": self.integrate_script_path, + "queue_name": self.queue_name, + "barcode_id_count": self.barcode_id_count, + "raw_fastq_dir": self.raw_fastq_dir, + "tmp_dir": self.tmp_dir, + "output_dir": self.output_path})) + + return job_script_path diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py new file mode 100644 index 00000000..3d68d4c8 --- /dev/null +++ b/sequence_processing_pipeline/TellReadJob.py @@ -0,0 +1,174 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet + + +logging.basicConfig(level=logging.DEBUG) + + +class TellReadJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, reference_base, + reference_map, sing_script_path, cores_per_task): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param reference_base: None + :param reference_map: None + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TellReadJob', + [], + 1, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. + self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.sing_script_path = sing_script_path + + sheet = load_sample_sheet(self.sample_sheet_path) + lane = sheet.samples[0].Lane + + # force self.lane_number to be int. raise an Error if it's not. 
+ tmp = int(lane) + if tmp < 1 or tmp > 8: + raise ValueError(f"'{tmp}' is not a valid lane number") + self.lane_number = tmp + + self.job_name = (f"{self.qiita_job_id}-tellread") + + def run(self, callback=None): + job_script_path = self._generate_job_script() + + # everything is in the job script so there are no additional params. + params = [] + + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TellReadJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + # TODO: We need more examples of failed jobs before we can create + # a parser for the logs. + # info = self.parse_logs() + # prepend just the message component of the Error. + # info.insert(0, str(e)) + info = str(e) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TellReadJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], + sample['Sample_Project'], + sample['barcode_id'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. + return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'tellread_test.sbatch') + template = self.jinja_env.get_template("tellread.sbatch") + + # generate a comma separated list of sample-ids from the tuples stored + # in self.sample_ids. + + # NB: Proposed sample-sheets will have traditional Sample_ID and + # Sample_Name columns as well as a new value named barcode_id. It's + # this column that will contain the 'C50n' values needed to be + # supplied to tellread. Later we will use this mapping to rename the + # files from C50n...fastq.gz to sample-name...fastq.gz. + samples = ','.join([id[2] for id in self.sample_ids]) + + # since we haven't included support for reference_map yet, whenever a + # reference is not included, the mapping against the list of sample_ids + # is ['NONE', 'NONE', ..., 'NONE']. + refs = ','.join(['NONE' for _ in self.sample_ids]) + + extra = "" + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "tellread", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "queue_name": self.queue_name, + "sing_script_path": self.sing_script_path, + "modules_to_load": ' '.join(self.modules_to_load), + "lane": f"s_{self.lane_number}", + # NB: Note that we no longer create a sub-directory under the + # working directory for TellRead to create all its output + # folders and files. 
This means it is creating folders and + # files in the same directory that has our sbatch script and + # logs directory. Currently there are no name collisions, + # however. + "output": self.output_path, + "rundir_path": self.root_dir, + "samples": samples, + "refs": refs, + "extra": extra + })) + + return job_script_path diff --git a/sequence_processing_pipeline/aggregate_counts.py b/sequence_processing_pipeline/aggregate_counts.py new file mode 100644 index 00000000..ace90212 --- /dev/null +++ b/sequence_processing_pipeline/aggregate_counts.py @@ -0,0 +1,40 @@ +from os import walk +from sys import argv +from os.path import join, split +from json import dumps + + +def extract_metadata(log_output_file_path): + with open(log_output_file_path, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + if len(lines) != 2: + raise ValueError("error processing %s" % log_output_file_path) + _dir, _file = split(lines[0]) + seq_counts, base_pairs = lines[1].split('\t') + return _dir, _file, int(seq_counts), int(base_pairs) + + +def aggregate_counts(fp): + results = {} + + for root, dirs, files in walk(fp): + for _file in files: + if _file.endswith('.out'): + log_output_file = join(root, _file) + _dir, _file, seq_counts, base_pairs = \ + extract_metadata(log_output_file) + + if _dir not in results: + results[_dir] = {} + + results[_dir][_file] = {'seq_counts': seq_counts, + 'base_pairs': base_pairs} + + return results + + +if __name__ == '__main__': + results = aggregate_counts(argv[1]) + with open(argv[2], 'w') as f: + print(dumps(results, indent=2), file=f) diff --git a/sequence_processing_pipeline/contrib/create_picklist.py b/sequence_processing_pipeline/contrib/create_picklist.py new file mode 100644 index 00000000..a1d6a1d0 --- /dev/null +++ b/sequence_processing_pipeline/contrib/create_picklist.py @@ -0,0 +1,72 @@ +import os +# from metapool.metapool import * +from sys import argv +import pandas as pd +import matplotlib.pyplot as plt +from metapool.metapool import (read_survival, make_2D_array, + calculate_iseqnorm_pooling_volumes, + format_pooling_echo_pick_list) +import seaborn as sns + +input_sheet_filename = argv[1] + +plate_df_w_reads = pd.read_csv(input_sheet_filename, sep='\t') +plate_df_w_reads['Blank'] = [True if 'blank' in s.lower() else False + for s in plate_df_w_reads['Sample_Name']] +reads_column = 'read_counts' + +well_col = 'Sample_Well' +assert reads_column in plate_df_w_reads.columns + +f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(8, 8)) +# evenness plot +rmax = int(round(plate_df_w_reads[reads_column].max(), -2)) + +foo = read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] is True, + reads_column], + label='Blanks', + rmax=rmax) + +bar = read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] is False, + reads_column], + label='Samples', + rmax=rmax) + +survival_df = pd.concat([foo, bar]) + +ax3.set_xlabel(reads_column) +ax3.set_ylabel('Samples') +survival_df.plot(color=['coral', 'steelblue'], ax=ax1) +ax1.set_xlabel(reads_column) +ax1.set_ylabel('Samples') + +# Histogram +sns.histplot(plate_df_w_reads[reads_column], ax=ax3) + +# Boxplot +sns.boxplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax=ax4) +sns.stripplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax=ax4, + size=3, color='black', alpha=0.5) + +plt.tight_layout() +plt.savefig(input_sheet_filename + '.comboplot.pdf') + +pdfn = calculate_iseqnorm_pooling_volumes(plate_df_w_reads, + dynamic_range=20, + normalization_column=reads_column) 
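# For reference (not part of the contributed script): create_picklist.py
# expects a tab-separated input providing at least the Sample_Name,
# Sample_Well and read_counts columns this script relies on; the rows here
# are invented examples.
import pandas as pd

example = pd.DataFrame({'Sample_Name': ['sample_A', 'BLANK.1A'],
                        'Sample_Well': ['A1', 'A2'],
                        'read_counts': [1523311, 4220]})
example.to_csv('plate_counts.tsv', sep='\t', index=False)
# invoked as:  python create_picklist.py plate_counts.tsv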
+plt.savefig(input_sheet_filename + '.normalizedplot.pdf')
+
+vols = make_2D_array(pdfn,
+                     data_col='iSeq normpool volume',
+                     well_col=well_col).astype(float)
+
+# Write the picklist as .csv
+picklist_fp = input_sheet_filename + '.picklist.csv'
+
+if os.path.isfile(picklist_fp):
+    print("Warning! This file exists already.")
+
+picklist = format_pooling_echo_pick_list(vols, max_vol_per_well=30000)
+
+with open(picklist_fp, 'w') as f:
+    f.write(picklist)
diff --git a/sequence_processing_pipeline/contrib/integrate-indices-np.py b/sequence_processing_pipeline/contrib/integrate-indices-np.py
new file mode 100644
index 00000000..b1be83a6
--- /dev/null
+++ b/sequence_processing_pipeline/contrib/integrate-indices-np.py
@@ -0,0 +1,332 @@
+# Why
+# 1) cloudspades requires the index reads be inline in the record header
+# 2) Ariadne requires the data are sorted by the barcodes
+#
+# Inlining is easy. Sorting is complex as the amount of data is large, and
+# the ordering is determined externally to the data being sorted. To
+# determine order, all barcodes must be read in to gather the complete
+# barcode <-> record association; if only partial data is read then
+# associations to barcodes may be missed, and we cannot perform an insertion
+# sort efficiently as we're writing to disk. Once we know an order for the
+# records, we (currently) read in the entirety of the subsequent data (R1
+# then R2), reorder, and write. Performing this in blocks to minimize memory
+# may be possible, but we have to assume access is random as a grouping
+# barcode may be with any record along the file.
+#
+# A variety of approaches were considered, including:
+#   - indexing portions in a hashtable, reading inputs multiple times, and
+#     writing in blocks. This was tested in both rust and python. The amount
+#     of memory was large, and keeping it under control would have required
+#     many passes over the data on disk or in memory.
+#   - using pandas to do the grouping, which possibly avoids the memory
+#     burden of a hashmap. It didn't.
+#   - using mmap'd files. No go: these are large and we have to walk over
+#     them a lot.
+#
+# Parsing this stuff adds a lot of overhead in Python. It will add some, if
+# not a lot, in rust as well -- our test data had 65M sequences. So the
+# current approach operates on the raw file data itself, using regexes to
+# parse individual records. We use numpy for sorting and getting record
+# orders. This is memory expensive, but so far much less so than the other
+# approaches tried, and it does not require multiple passes over files. We
+# bottleneck on write IO, so to mitigate that, we are using a parallel gzip
+# (pgzip), which still bottlenecks but gets better throughput.
+#
+# There probably are smarter ways to do this to reduce the memory burden.
+# Right now, it's O(N) where N is the number of records. We load R1 and R2
+# separately though, so we at least halve the memory use. As for doing it
+# faster, at the moment we appear to saturate time on gzip. The easiest
+# solution would be to increase the number of threads, but then again, this
+# process is expected to run in an array, and the filesystem can only take
+# so much.
+#
+# In addition to the inline tests, md5 checks were performed to verify that
+# all record IDs are present in both R1 / R2, and relative to the original
+# input. Spot checks on an arbitrary set of records were performed on R1 /
+# R2 to verify no apparent unusual modification. And spot checks were
+# performed to verify that correct barcodes are incorporated as expected in
+# the output.
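+#
+# As a minimal illustration of the ordering idea that gather_order() below
+# relies on: np.argsort over the barcodes yields the record order, and
+# np.unique(..., return_index=True) yields the per-barcode boundaries, e.g.
+#
+#     barcodes = np.array([b'TTGG', b'ATGC', b'TTGG'])
+#     order = barcodes.argsort()        # -> array([1, 0, 2])
+#     uniq, bounds = np.unique(barcodes[order], return_index=True)
+#     # uniq -> [b'ATGC', b'TTGG'], bounds -> [0, 1]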
+# +# author: Daniel McDonald (d3mcdonald@eng.ucsd.edu) +import numpy as np +import click +import re +import io +import pgzip +import gzip + + +RECORD = re.compile(rb'@\S+\n[ATGCN]+\n\+\n\S+\n') +BARCODE = re.compile(rb'@\S+\n([ATGCN]+)\n\+\n\S+\n') + + +def gather_order(i1_in_fp): + """Determine record order + + This is a fancy way of saying: get all the barcodes, and sort them. + + We return the order of the sorted records, the unique barcodes, + and the bounds for what barcode associated with what record + """ + # determine barcode length + _ = i1_in_fp.readline() + b = i1_in_fp.readline() + rec_len = len(b.strip()) + i1_in_fp.seek(0) + + # we need larger data in memory later anyway... + i1 = i1_in_fp.read() + start = 0 + end = len(i1) + + # get the number of records. we completely assume non-multiline fastq here + newlines = i1.count(b'\n') + assert newlines % 4 == 0 + barcodes = np.empty(newlines // 4, dtype='|S%d' % rec_len) + + # walk all index records + # grab each barcode + idx = 0 + while start < end: + barcode_result = BARCODE.search(i1, pos=start) + barcode = barcode_result.groups()[0] + assert len(barcode) == rec_len # get angry if the barcode is weird + + barcodes[idx] = barcode + idx += 1 + start = barcode_result.end() + + # we no longer need the raw data so let's toss it + del i1 + + # determine the record order of a lexicographic sort + # gather the unique barcodes so we can use them later, and the bounding + # points in the sorted set + record_order = barcodes.argsort() + barcodes = barcodes[record_order] + unique_barcodes, barcode_bounds = np.unique(barcodes, return_index=True) + + return record_order, unique_barcodes, barcode_bounds + + +def test_gather_order(): + i1data = [b'@foo', b'ATGC', b'+', b'!!!!', + b'@bar', b'TTGG', b'+', b'!!!!', + b'@baz', b'ATGC', b'+', b'!!!!', + b'@oof', b'TTTT', b'+', b'!!!!', + b'@rab', b'TTGG', b'+', b'!!!!', + b'@zab', b'TTTT', b'+', b'!!!!', + b'@ofo', b'TTTT', b'+', b'!!!!', b''] + + i1 = io.BytesIO(b'\n'.join(i1data)) + order, unique, bounds = gather_order(i1) + + exp_order = np.array([0, 2, 1, 4, 3, 5, 6]) + exp_unique = np.array([b'ATGC', b'TTGG', b'TTTT']) + exp_bounds = np.array([0, 2, 4]) + + assert (order == exp_order).all() + assert (unique == exp_unique).all() + assert (bounds == exp_bounds).all() + + +def troll_and_write(order, unique, bounds, in_, out_): + """Walk over the raw data, spit out barcode amended records in order + + - read all data + - get index boundaries for each record + - pull out each record in order according to the barcode data + - associate the barcode + - write + """ + + data = in_.read() + boundaries = np.empty([order.size, 2], dtype=np.uint64) + + stop = 0 + for idx in range(order.size): + rec = RECORD.search(data, pos=stop) + start, stop = rec.span() + boundaries[idx] = np.array([start, stop], dtype=np.uint64) + + current_barcode_idx = 0 + current_barcode = unique[current_barcode_idx] + current_barcode_bound_end = bounds[current_barcode_idx + 1] + + for order_idx, record_idx in enumerate(order): + if order_idx >= current_barcode_bound_end: + current_barcode_idx += 1 + + if current_barcode_idx >= bounds.size: + raise ValueError("should not happen?") + current_barcode = unique[current_barcode_idx] + + if current_barcode_idx + 1 >= bounds.size: + # run to the end + current_barcode_bound_end = order.size + else: + current_barcode_bound_end = bounds[current_barcode_idx + 1] + + start, stop = boundaries[record_idx] + record = data[start:stop] + + # in a one-off, these might pass by chance. 
It would be real weird + # for them to always pass for all records in a large file. + # n.b., b'foo'[0] is int, because yay, so we use a slice to maintain + # a human readable character to test against as most mortals haven't + # memorized the ascii table + assert record[:1] == b'@' + assert record[-1:] == b'\n' + + with_barcode = insert_barcode(record, current_barcode) + out_.write(with_barcode) + + +def test_troll_and_write(): + i1data = [b'@foo', b'ATGC', b'+', b'!!!!', + b'@bar', b'TTGG', b'+', b'!!!!', + b'@baz', b'ATGC', b'+', b'!!!!', + b'@oof', b'TTTT', b'+', b'!!!!', + b'@rab', b'TTGG', b'+', b'!!!!', + b'@zab', b'TTTT', b'+', b'!!!!', + b'@ofo', b'TTTT', b'+', b'!!!!', b''] + + i1 = io.BytesIO(b'\n'.join(i1data)) + order, unique, bounds = gather_order(i1) + + # we assume records are in the same order, as that has previously been + # observed w/ tellread and is the normal expectation + r1data = [b'@foo', b'AATGC', b'+', b'!!!!!', + b'@bar', b'ATTGG', b'+', b'!!!!!', + b'@baz', b'AATGC', b'+', b'!!!!!', + b'@oof', b'ATTTT', b'+', b'!!!!!', + b'@rab', b'ATTGG', b'+', b'!!!!!', + b'@zab', b'ATTTT', b'+', b'!!!!!', + b'@ofo', b'ATTTT', b'+', b'!!!!!', b''] + r1 = io.BytesIO(b'\n'.join(r1data)) + r1out = io.BytesIO() + troll_and_write(order, unique, bounds, r1, r1out) + r1out.seek(0) + + r1exp = [b'@foo BX:Z:ATGC-1', b'AATGC', b'+', b'!!!!!', + b'@baz BX:Z:ATGC-1', b'AATGC', b'+', b'!!!!!', + b'@bar BX:Z:TTGG-1', b'ATTGG', b'+', b'!!!!!', + b'@rab BX:Z:TTGG-1', b'ATTGG', b'+', b'!!!!!', + b'@oof BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b'@zab BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b'@ofo BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b''] + r1exp = b'\n'.join(r1exp) + assert r1exp == r1out.read() + + +def create_tag(t): + return b'BX:Z:%s-1' % t + + +def create_tag_no_suffix(t): + return b'BX:Z:%s' % t + + +def insert_barcode(record, barcode): + """Get the current ID, smash the needed tag in""" + # @foo\nATGC\n+\n!!!!\n + id_, remainder = record.split(b'\n', 1) + tag = create_tag(barcode) + return b'%s %s\n%s' % (id_, tag, remainder) + + +def readfq(fp): + if fp.mode == 'rb': + strip = bytes.strip + else: + strip = str.strip + + id_ = iter(fp) + seq = iter(fp) + dumb = iter(fp) + qual = iter(fp) + for rec in zip(id_, seq, dumb, qual): + yield list(map(strip, rec)) + + +def writefq(rec, out): + for item in rec: + out.write(item) + out.write(b'\n') + + +@click.group() +def cli(): + pass + + +@cli.command() +def tests(): + test_gather_order() + test_troll_and_write() + + +@cli.command() +@click.option('--r1-in', type=click.Path(exists=True), required=True) +@click.option('--r2-in', type=click.Path(exists=True), required=True) +@click.option('--i1-in', type=click.Path(exists=True), required=True) +@click.option('--r1-out', type=click.Path(exists=False), required=True) +@click.option('--r2-out', type=click.Path(exists=False), required=True) +@click.option('--threads', type=int, required=False, default=1) +@click.option('--no-sort', is_flag=True, default=False) +def integrate(r1_in, r2_in, i1_in, r1_out, r2_out, threads, no_sort): + r1_in_fp = open(r1_in, 'rb') + r2_in_fp = open(r2_in, 'rb') + i1_in_fp = open(i1_in, 'rb') + + if no_sort: + r1_out_fp = gzip.open(r1_out, mode='wb') + r2_out_fp = gzip.open(r2_out, mode='wb') + + r1_sniff = r1_in_fp.readline().strip() + r2_sniff = r2_in_fp.readline().strip() + r1_in_fp.seek(0) + r2_in_fp.seek(0) + + # outputs from tellread don't seem to have orientation information + # some downstream programs hate this, so let's add if needed. 
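+        # For example, an existing header b'@read1/1' keeps its suffix and
+        # only gains the tag (b'@read1/1 BX:Z:<barcode>'), while a bare
+        # b'@read1' first gains b'/1' and then the tag; the barcode comes
+        # from the matching I1 record (see the header rewrites below).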
+ if r1_sniff.endswith(b'/1'): + if not r2_sniff.endswith(b'/2'): + raise ValueError('unexpected endings: ' + f'{r1_sniff.decode("utf-8")} ' + f'{r2_sniff.decode("utf-8")}') + orient_r1 = '' + orient_r2 = '' + else: + assert b'/1' not in r1_sniff + + orient_r1 = b'/1' + orient_r2 = b'/2' + + for (r1, r2, i1) in zip(*map(readfq, [r1_in_fp, r2_in_fp, i1_in_fp])): + assert r1[0] == r2[0] + assert r1[0] == i1[0] + + tag = create_tag_no_suffix(i1[1]) + r1[0] = b"%s%s %s" % (r1[0], orient_r1, tag) + r2[0] = b"%s%s %s" % (r2[0], orient_r2, tag) + writefq(r1, r1_out_fp) + writefq(r2, r2_out_fp) + r1_out_fp.close() + r2_out_fp.close() + else: + # 200MB is what they use in their readme... + r1_out_fp = pgzip.open(r1_out, mode='wb', thread=threads, + blocksize=2*10**8) + r2_out_fp = pgzip.open(r2_out, mode='wb', thread=threads, + blocksize=2*10**8) + + order, unique, bounds = gather_order(i1_in_fp) + + for in_, out_ in zip([r1_in_fp, r2_in_fp], [r1_out_fp, r2_out_fp]): + troll_and_write(order, unique, bounds, in_, out_) + in_.close() + out_.close() + + +if __name__ == '__main__': + cli() diff --git a/sequence_processing_pipeline/contrib/plot_counts.py b/sequence_processing_pipeline/contrib/plot_counts.py new file mode 100644 index 00000000..ecab9e49 --- /dev/null +++ b/sequence_processing_pipeline/contrib/plot_counts.py @@ -0,0 +1,27 @@ +import matplotlib.pyplot as plt +import re +import sys +import os +import pandas as pd + +ex = re.compile(r'_I1_(C5\d\d).fastq.gz.corrected.err_barcode_removed.fastq') + +# remove total line from wc +data = [x.strip().split(' ') for x in open(sys.argv[1])][:-1] +plotdata = [(ex.search(i).groups()[0], int(v) / 4) for v, i in data] +sheetdata = dict(plotdata) + +ordered = sorted(plotdata, key=lambda x: x[1]) +f = plt.figure(figsize=(16, 8)) +plt.bar([i for i, _ in ordered], [v for _, v in ordered]) +plt.ylabel('I1 reads') +plt.xticks(list(range(len(ordered))), [i for i, _ in ordered], rotation=90) +plt.savefig(sys.argv[3] + '/counts.pdf') + +sheet = pd.read_csv(sys.argv[2], dtype=str) +sheet = sheet[~sheet['Lane'].isnull()] +sheet['read_counts'] = [sheetdata[i] for i in sheet['Barcode_ID']] +name = os.path.basename(sys.argv[2]).rsplit('.', 1)[0] +newname = name + '.read_counts.tsv' + +sheet.to_csv(sys.argv[3] + '/' + newname, sep='\t', index=False, header=True) diff --git a/sequence_processing_pipeline/scripts/fake_squeue.py b/sequence_processing_pipeline/scripts/fake_squeue.py new file mode 100755 index 00000000..6c8511ce --- /dev/null +++ b/sequence_processing_pipeline/scripts/fake_squeue.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +from json import load, dumps +from os.path import exists, join +from sys import argv +from random import randint, choice + + +def print_state(state): + # Note that %i will appear w/column name 'JOBID' in actual squeue output. + # this is because %i shows the array-id if it's an array job and what we + # consider the regular job-id if it's not an array job. 
+ print("JOBID,STATE") + for job_id in state: + if 'array_ids' in state[job_id]: + # this is an array job + for array_id in state[job_id]['array_ids']: + if state[job_id]['array_ids'][array_id] <= 0: + end_state = state[job_id]['endgame'][array_id] + else: + end_state = 'RUNNING' + + print(f"{array_id},{end_state}") + else: + # this is a non-array job + if state[job_id]['countdown'] <= 0: + end_state = state[job_id]['endgame'] + else: + end_state = 'RUNNING' + + print(f"{job_id},{end_state}") + + +def generate_output(job_ids): + results = {} + + for job_id in job_ids: + is_successful = choice([True, False]) + is_array_job = choice([True, False]) + + if is_array_job: + result = {'job_id': job_id} + result['array_ids'] = {} + result['endgame'] = {} + + for i in range(0, randint(5, 15)): + array_id = "%s_%d" % (job_id, i) + result['array_ids'][array_id] = randint(3, 7) + result['array_ids'][array_id] = randint(3, 7) + if is_successful: + # all array jobs must be successful + result['endgame'][array_id] = "COMPLETED" + else: + # some jobs may succeed but some may fail + result['endgame'][array_id] = choice( + ['COMPLETED', 'FAILED']) + results[job_id] = result + else: + result = {'job_id': job_id} + result['countdown'] = randint(3, 7) + result['endgame'] = choice(['COMPLETED', 'FAILED']) + results[job_id] = result + + return results + + +def save_state(state, file_path): + with open(file_path, 'w') as f: + print(dumps(state, indent=2), file=f) + + +def load_state(file_path): + with open(file_path, 'r') as f: + return load(f) + + +if __name__ == "__main__": + # "squeue -t all -j " f"{','.join(job_ids)} " "-o '%i,%T'" + job_ids = argv[4].split(',') + + state_file_path = join("sequence_processing_pipeline", "scripts", + "my_state.json") + + state = generate_output(job_ids) + + if exists(state_file_path): + state = load_state(state_file_path) + else: + state = generate_output(job_ids) + + print_state(state) + + for job_id in state: + if 'array_ids' in state[job_id]: + # this is an array job. + for array_id in state[job_id]['array_ids']: + state[job_id]['array_ids'][array_id] -= 1 + else: + # this is a standard job. + state[job_id]['countdown'] -= 1 + + save_state(state, state_file_path) diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch new file mode 100644 index 00000000..68ebce5e --- /dev/null +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -0,0 +1,67 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} +#SBATCH --array=1-{{barcode_id_count}} + +#SBATCH --output {{output_dir}}/logs/integrate_%x_%A_%a.out +#SBATCH --error {{output_dir}}/logs/integrate_%x_%A_%a.err + +set -x +set -e + +samples=($(cat {{output_dir}}/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR={{tmp_dir}} + +# get list of samples and determine which sample this array instance will work +# on. +samples=($(cat {{output_dir}}/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "Processing sample ${sample}..." + +# make temp directory +export TMPDIR={{tmp_dir}} +mkdir -p $TMPDIR + + +# TODO: All three input files must be non-zero in length. +# If possible, do this check as part of normal FSR operation. +# Previously this was done right here BEFORE integrating, rather +# than after. 
+ +# NB: non-zero file-length check removed for now. This should be performed +# by FSR after processing is done. +# TODO: Make sure raw_fastq_dir is TellReadJob/Full +r1_in={{raw_fastq_dir}}/TellReadJob_R1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +r2_in={{raw_fastq_dir}}/TellReadJob_R2_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +i1_in={{raw_fastq_dir}}/TellReadJob_I1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq + +# create output directory +mkdir -p {{output_dir}}/integrated + +# generate output file names +r1_out={{output_dir}}/integrated/${sample}.R1.fastq.gz +r2_out={{output_dir}}/integrated/${sample}.R2.fastq.gz +i1_out={{output_dir}}/integrated/${sample}.I1.fastq.gz + +# generate 'integrated' I1 fastq.gz file. We do this as part of each array so +# they're done in parallel. +gzip -c ${i1_in} > ${i1_out} + +# generate integrated R1 and R2 fastq.gz files. +conda activate qp-knight-lab-processing-2022.03 + +python {{integrate_script_path}} integrate \ +--no-sort \ +--r1-in ${r1_in} \ +--r2-in ${r2_in} \ +--i1-in ${i1_in} \ +--r1-out ${r1_out} \ +--r2-out ${r2_out} \ +--threads {{cores_per_task}} diff --git a/sequence_processing_pipeline/templates/seq_counts.sbatch b/sequence_processing_pipeline/templates/seq_counts.sbatch new file mode 100644 index 00000000..f44bd5b9 --- /dev/null +++ b/sequence_processing_pipeline/templates/seq_counts.sbatch @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} +#SBATCH --array=1-{{file_count}} + +#SBATCH --output {{output_path}}/logs/%x_%A_%a.out +#SBATCH --error {{output_path}}/logs/%x_%A_%a.err + +set -x +set -e + +mkdir -p {{output_path}}/logs + +files=($(cat {{output_path}}/files_to_count.txt)) +my_file=${files[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "${my_file}" + +conda activate qp-knight-lab-processing-2022.03 + +seqtk size ${my_file} diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch new file mode 100644 index 00000000..3c31219d --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -0,0 +1,13 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} # cleanup +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + +#SBATCH --output {{output}}/logs/cleanup_%x-%A.out +#SBATCH --error {{output}}/logs/cleanup_%x-%A.err + +# remove unused large outputs +rm -rf {{OUTPUT}}/biosample_format {{OUTPUT}}/1_demult {{OUTPUT}}/Full diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch new file mode 100644 index 00000000..f038a568 --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -0,0 +1,48 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH -p {{queue_name}} +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH --time {{wall_time_limit}} + +#SBATCH --output {{output}}/logs/tellread_%x-%A.out +#SBATCH --error {{output}}/logs/tellread_%x-%A.err + +set -x + +module load {{modules_to_load}} +{{sing_script_path}} \ + -i {{rundir_path}} \ + -o {{output}} \ + -s $(echo {{samples}} | tr -d '"') \ + -g $(echo {{refs}} | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} {{extra}} \ + -l {{lane}} + +# get the 
timestamp for the most recently changed file in directory '.' + +# hard-limit for wait time set to ~ 8 hours. +# (4 checks per hour, for 8 hours equals 32 iterations) +for i in $(seq 1 32); +do + before="$(find {{output}}/Full -type f -printf '%T@\n' | sort -n | tail -1)" + # assume TellReadJob is finished if ctime hasn't changed in 15 minutes + # for any fastq file in the directory. + sleep 900 + after="$(find {{output}}/Full -type f -printf '%T@\n' | sort -n | tail -1)" + + echo "$before $after" + + if [[ "$before" == "$after" ]]; then + echo "DONE" + exit 0 + else + echo "NOT DONE" + fi +done + +# if we've reached this point then we've exceeded our hard-limit for waiting. +# return w/an error. +exit 1 + diff --git a/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv b/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv new file mode 100644 index 00000000..f696f0c9 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv @@ -0,0 +1,41 @@ +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Barcode_96_Well_Position,Barcode_ID,Sample_Project,Well_description,Lane +Person.A.TELLSEQ.R20.microbe,Person.A.TELLSEQ.R20.microbe,TellSeq3_15196_P3,A1,A4,C525,TellSeq3_15196_P3,Person.A.TELLSEQ.R20.microbe,1 +Person.B.TELLSEQ.R24.microbe,Person.B.TELLSEQ.R24.microbe,TellSeq3_15196_P3,B1,B4,C526,TellSeq3_15196_P3,Person.B.TELLSEQ.R24.microbe,1 +Person.C.TELLSEQ.R21.microbe,Person.C.TELLSEQ.R21.microbe,TellSeq3_15196_P3,C1,C4,C527,TellSeq3_15196_P3,Person.C.TELLSEQ.R21.microbe,1 +Person.D.TELLSEQ.R26.microbe,Person.D.TELLSEQ.R26.microbe,TellSeq3_15196_P3,D1,D4,C528,TellSeq3_15196_P3,Person.D.TELLSEQ.R26.microbe,1 +Person.E.TELLSEQ.R19.microbe,Person.E.TELLSEQ.R19.microbe,TellSeq3_15196_P3,E1,E4,C529,TellSeq3_15196_P3,Person.E.TELLSEQ.R19.microbe,1 +Pet.C.TELLSEQ.R23.microbe,Pet.C.TELLSEQ.R23.microbe,TellSeq3_15196_P3,F1,F4,C530,TellSeq3_15196_P3,Pet.C.TELLSEQ.R23.microbe,1 +BLANK.TELLSEQ.3.12.H.microbe,BLANK.TELLSEQ.3.12.H.microbe,TellSeq3_15196_P3,G1,G4,C531,TellSeq3_15196_P3,BLANK.TELLSEQ.3.12.H.microbe,1 +Isolate.115.R1.microbe,Isolate.115.R1.microbe,TellSeq3_15196_P1,H1,H4,C532,TellSeq3_15196_P3,Isolate.115.R1.microbe,1 +Zymo.Mock.Community.R1.microbe,Zymo.Mock.Community.R1.microbe,TellSeq3_15196_P1,A2,A5,C533,TellSeq3_15196_P3,Zymo.Mock.Community.R1.microbe,1 +E.coli.QC.DNA.R1.microbe,E.coli.QC.DNA.R1.microbe,TellSeq3_15196_P1,B2,B5,C534,TellSeq3_15196_P3,E.coli.QC.DNA.R1.microbe,1 +Person.A.TELLSEQ.R20.purified.microbe,Person.A.TELLSEQ.R20.purified.microbe,TellSeq3_15196_P3,C2,C5,C535,TellSeq3_15196_P3,Person.A.TELLSEQ.R20.purified.microbe,1 +Person.B.TELLSEQ.R24.purified.microbe,Person.B.TELLSEQ.R24.purified.microbe,TellSeq3_15196_P3,D2,D5,C536,TellSeq3_15196_P3,Person.B.TELLSEQ.R24.purified.microbe,1 +Person.C.TELLSEQ.R21.purified.microbe,Person.C.TELLSEQ.R21.purified.microbe,TellSeq3_15196_P3,E2,E5,C537,TellSeq3_15196_P3,Person.C.TELLSEQ.R21.purified.microbe,1 +Person.D.TELLSEQ.R26.purified.microbe,Person.D.TELLSEQ.R26.purified.microbe,TellSeq3_15196_P3,F2,F5,C538,TellSeq3_15196_P3,Person.D.TELLSEQ.R26.purified.microbe,1 +Person.E.TELLSEQ.R19.purified.microbe,Person.E.TELLSEQ.R19.purified.microbe,TellSeq3_15196_P3,G2,G5,C539,TellSeq3_15196_P3,Person.E.TELLSEQ.R19.purified.microbe,1 +Pet.C.TELLSEQ.R23.purified.microbe,Pet.C.TELLSEQ.R23.purified.microbe,TellSeq3_15196_P3,H2,H5,C540,TellSeq3_15196_P3,Pet.C.TELLSEQ.R23.purified.microbe,1 
+BLANK.TELLSEQ.3.12.H.purified.microbe,BLANK.TELLSEQ.3.12.H.purified.microbe,TellSeq3_15196_P3,A3,A6,C541,TellSeq3_15196_P3,BLANK.TELLSEQ.3.12.H.purified.microbe,1 +Isolate.115.R2.microbe,Isolate.115.R2.microbe,TellSeq3_15196_P1,B3,B6,C542,TellSeq3_15196_P3,Isolate.115.R2.microbe,1 +Zymo.Mock.Community.R2.microbe,Zymo.Mock.Community.R2.microbe,TellSeq3_15196_P1,C3,C6,C543,TellSeq3_15196_P3,Zymo.Mock.Community.R2.microbe,1 +E.coli.QC.DNA.R2.microbe,E.coli.QC.DNA.R2.microbe,TellSeq3_15196_P1,D3,D6,C544,TellSeq3_15196_P3,E.coli.QC.DNA.R2.microbe,1 +Person.A.TELLSEQ.R20.std,Person.A.TELLSEQ.R20.std,TellSeq3_15196_P3,A1,A1,C501,TellSeq3_15196,Person.A.TELLSEQ.R20.std,1 +Person.B.TELLSEQ.R24.std,Person.B.TELLSEQ.R24.std,TellSeq3_15196_P3,B1,B1,C502,TellSeq3_15196,Person.B.TELLSEQ.R24.std,1 +Person.C.TELLSEQ.R21.std,Person.C.TELLSEQ.R21.std,TellSeq3_15196_P3,C1,C1,C503,TellSeq3_15196,Person.C.TELLSEQ.R21.std,1 +Person.D.TELLSEQ.R26.std,Person.D.TELLSEQ.R26.std,TellSeq3_15196_P3,D1,D1,C504,TellSeq3_15196,Person.D.TELLSEQ.R26.std,1 +Person.E.TELLSEQ.R19.std,Person.E.TELLSEQ.R19.std,TellSeq3_15196_P3,E1,E1,C505,TellSeq3_15196,Person.E.TELLSEQ.R19.std,1 +Pet.C.TELLSEQ.R23.std,Pet.C.TELLSEQ.R23.std,TellSeq3_15196_P3,F1,F1,C506,TellSeq3_15196,Pet.C.TELLSEQ.R23.std,1 +BLANK.TELLSEQ.3.12.H.std,BLANK.TELLSEQ.3.12.H.std,TellSeq3_15196_P3,G1,G1,C507,TellSeq3_15196,BLANK.TELLSEQ.3.12.H.std,1 +Isolate.115.R1.std,Isolate.115.R1.std,TellSeq3_15196_P1,H1,H1,C508,TellSeq3_15196,Isolate.115.R1.std,1 +Zymo.Mock.Community.R1.std,Zymo.Mock.Community.R1.std,TellSeq3_15196_P1,A2,A2,C509,TellSeq3_15196,Zymo.Mock.Community.R1.std,1 +E.coli.QC.DNA.R1.std,E.coli.QC.DNA.R1.std,TellSeq3_15196_P1,B2,B2,C510,TellSeq3_15196,E.coli.QC.DNA.R1.std,1 +Person.A.TELLSEQ.R20.purified.std,Person.A.TELLSEQ.R20.purified.std,TellSeq3_15196_P3,C2,C2,C511,TellSeq3_15196,Person.A.TELLSEQ.R20.purified.std,1 +Person.B.TELLSEQ.R24.purified.std,Person.B.TELLSEQ.R24.purified.std,TellSeq3_15196_P3,D2,D2,C512,TellSeq3_15196,Person.B.TELLSEQ.R24.purified.std,1 +Person.C.TELLSEQ.R21.purified.std,Person.C.TELLSEQ.R21.purified.std,TellSeq3_15196_P3,E2,E2,C513,TellSeq3_15196,Person.C.TELLSEQ.R21.purified.std,1 +Person.D.TELLSEQ.R26.purified.std,Person.D.TELLSEQ.R26.purified.std,TellSeq3_15196_P3,F2,F2,C514,TellSeq3_15196,Person.D.TELLSEQ.R26.purified.std,1 +Person.E.TELLSEQ.R19.purified.std,Person.E.TELLSEQ.R19.purified.std,TellSeq3_15196_P3,G2,G2,C515,TellSeq3_15196,Person.E.TELLSEQ.R19.purified.std,1 +Pet.C.TELLSEQ.R23.purified.std,Pet.C.TELLSEQ.R23.purified.std,TellSeq3_15196_P3,H2,H2,C516,TellSeq3_15196,Pet.C.TELLSEQ.R23.purified.std,1 +BLANK.TELLSEQ.3.12.H.purified.std,BLANK.TELLSEQ.3.12.H.purified.std,TellSeq3_15196_P3,A3,A3,C517,TellSeq3_15196,BLANK.TELLSEQ.3.12.H.purified.std,1 +Isolate.115.R2.std,Isolate.115.R2.std,TellSeq3_15196_P1,B3,B3,C518,TellSeq3_15196,Isolate.115.R2.std,1 +Zymo.Mock.Community.R2.std,Zymo.Mock.Community.R2.std,TellSeq3_15196_P1,C3,C3,C519,TellSeq3_15196,Zymo.Mock.Community.R2.std,1 +E.coli.QC.DNA.R2.std,E.coli.QC.DNA.R2.std,TellSeq3_15196_P1,D3,D3,C520,TellSeq3_15196,E.coli.QC.DNA.R2.std,1 diff --git a/sequence_processing_pipeline/tests/data/aggregate_counts_results.json b/sequence_processing_pipeline/tests/data/aggregate_counts_results.json new file mode 100644 index 00000000..1cae0f05 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/aggregate_counts_results.json @@ -0,0 +1,36 @@ +{ + "REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full": { + "TellReadJob_I1_C520.fastq.gz.erroneous.fastq": { + 
"seq_counts": 2139633, + "base_pairs": 38513394 + }, + "TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 8345327641 + }, + "TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 9293296513 + }, + "TellReadJob_I1_C519.fastq.gz.erroneous.fastq": { + "seq_counts": 1932116, + "base_pairs": 34778088 + }, + "TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 1160354916 + }, + "TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 8370238082 + }, + "TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 9317943166 + }, + "TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 1267182504 + } + } +} diff --git a/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json b/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json index 089e82f1..c82c76b0 100644 --- a/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json +++ b/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json @@ -3,6 +3,25 @@ "instrument_type": "iseq", "assay_type": "Metagenomic", "configuration": { + "tell-seq": { + "label": "my_label", + "reference_base": "", + "reference_map": "", + "sing_script_path": "/my_path/tellread-release-novaseqX/run_tellread_sing.sh", + "nodes": 1, + "lane": 1, + "sample_index_list": "/my_path/sample_index_list_1.txt", + "queue": "qiita", + "wallclock_time_in_minutes": 1440, + "modules_to_load": ["singularity_3.6.4"], + "integrate_script_path": "/my_path/integrate-indices-np.py", + "tellread_mem_limit": "16", + "tellread_cores": "4", + "normcount_cores": "1", + "integrate_cores": "1", + "normcount_mem_limit": "8", + "integrate_mem_limit": "8" + }, "bcl2fastq": { "nodes": 1, "nprocs": 16, diff --git a/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt b/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt new file mode 100644 index 00000000..1c4345ef --- /dev/null +++ b/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt @@ -0,0 +1,96 @@ +CCCCCACCAA C501 NONE PE +AACCCCCACA C502 NONE PE +CCAACACACC C503 NONE PE +AACACCCCCA C504 NONE PE +CAAAACCCCC C505 NONE PE +ACACACCACC C506 NONE PE +AACCCACACC C507 NONE PE +CAAAAAAAAA C508 NONE PE +AAACCACCCC C509 NONE PE +ACACCCCCCC C510 NONE PE +AAACACCACA C511 NONE PE +CAAAACCCCA C512 NONE PE +ACCCCAACCC C513 NONE PE +CAACCAACAC C514 NONE PE +CCCCCACCCA C515 NONE PE +CCAACCCCCA C516 NONE PE +CAAAACACCC C517 NONE PE +ACACACCAAA C518 NONE PE +CCACCAAAAA C519 NONE PE +AAACCCCCCC C520 NONE PE +AACAACCCCA C521 NONE PE +CAACAACAAC C522 NONE PE +CACCACCAAA C523 NONE PE +CACAACAAAC C524 NONE PE +AACACCCACC C525 NONE PE +CAAAACACAA C526 NONE PE +AAAAAAAAAA C527 NONE PE +CCAACCCCCA C528 NONE PE +CAACCCCAAA C529 NONE PE +ACCCAACCCA C530 NONE PE +CACACCAAAC C531 NONE PE +CAACAAAAAC C532 NONE PE +CCAAAAAAAC C533 NONE PE +ACCAACACAC C534 NONE PE +CCAAAACACC C535 NONE PE +CACCCAAACC C536 NONE PE +CAAACCAAAC C537 NONE PE +CACAAACACA C538 NONE PE +ACCCACCCCC C539 NONE PE +AACCACACAC C540 NONE PE +CACCCCCACA C541 NONE PE +CACAACCACC C542 NONE PE +AAAAACAAAA C543 NONE PE +CCACACAAAC C544 NONE PE +AAACAAACAC C545 NONE PE +ACCCCCACCC C546 NONE PE +ACACCCCAAA C547 NONE PE +CAAACCAAAC 
C548 NONE PE +AAACCACCAA C549 NONE PE +CAACCAAAAC C550 NONE PE +ACACCCCCCC C551 NONE PE +CACCCACCAC C552 NONE PE +ACCCCAACCC C553 NONE PE +AAACCCAACA C554 NONE PE +ACCACACAAA C555 NONE PE +ACCCACCACC C556 NONE PE +CCCCAACCAA C557 NONE PE +CAAAAACACC C558 NONE PE +ACCACAAAAC C559 NONE PE +ACCCCCCCAA C560 NONE PE +CCACAAAACA C561 NONE PE +CAAACCCACC C562 NONE PE +ACACCACAAA C563 NONE PE +ACCAACAAAA C564 NONE PE +CCCAACAAAA C565 NONE PE +CACCCCCCCA C566 NONE PE +AAACCCACCA C567 NONE PE +CACACCACAA C568 NONE PE +CCAAACCCCA C569 NONE PE +CACCCCACCC C570 NONE PE +AAACCCCCAA C571 NONE PE +ACACCACACC C572 NONE PE +ACAAAACACC C573 NONE PE +CACAAACCAC C574 NONE PE +ACCCCCACAA C575 NONE PE +CCCCAAACCC C576 NONE PE +AAAACACAAC C577 NONE PE +AACCCAACCA C578 NONE PE +AAACACACAC C579 NONE PE +AAACACCACC C580 NONE PE +AACCCCAACA C581 NONE PE +CCACCCAAAC C582 NONE PE +CCAAAACAAC C583 NONE PE +ACCAACAAAC C584 NONE PE +AAACACCACC C585 NONE PE +AACCACACAC C586 NONE PE +CACAACAAAA C587 NONE PE +AACCAAAAAC C588 NONE PE +ACCAAACCAA C589 NONE PE +ACAAACACAC C590 NONE PE +ACCACACCAA C591 NONE PE +AAAAACAACC C592 NONE PE +CACACAACAC C593 NONE PE +CCCCCAACCC C594 NONE PE +ACACAAAACC C595 NONE PE +CCACCACACC C596 NONE PE diff --git a/sequence_processing_pipeline/tests/data/files_to_count.txt b/sequence_processing_pipeline/tests/data/files_to_count.txt new file mode 100644 index 00000000..8d7ce4b1 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/files_to_count.txt @@ -0,0 +1,8 @@ +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.erroneous.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.erroneous.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts.sbatch b/sequence_processing_pipeline/tests/data/seq_counts.sbatch new file mode 100644 index 00000000..cc73187c --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts.sbatch @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J seq_counts +#SBATCH --time 1440 +#SBATCH --mem 8G +#SBATCH -N 1 +#SBATCH -c 1 +#SBATCH -p qiita +#SBATCH --array=1-8 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs/%x_%A_%a.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs/%x_%A_%a.err + +set -x +set -e + +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs + +files=($(cat 
sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/files_to_count.txt)) +my_file=${files[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "${my_file}" + +conda activate qp-knight-lab-processing-2022.03 + +seqtk size ${my_file} diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out new file mode 100644 index 00000000..50a46674 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 8345327641 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out new file mode 100644 index 00000000..87ad9f55 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 9293296513 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out new file mode 100644 index 00000000..a22d9f8d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.erroneous.fastq +1932116 34778088 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out new file mode 100644 index 00000000..0b35614a --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 9317943166 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out new file mode 100644 index 00000000..887522ae --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 1267182504 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out new file mode 100644 index 00000000..a4fbd555 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 8370238082 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out new file mode 100644 index 00000000..6c6a9c06 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 1160354916 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out new file mode 100644 index 00000000..9be52329 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.erroneous.fastq +2139633 38513394 diff --git a/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv b/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv new file mode 100644 index 00000000..105330fd --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv @@ -0,0 +1,135 @@ +[Header],,,,,,,, +IEMFileVersion,1,,,,,,, +SheetType,tellseq_metag,,,,,,, +SheetVersion,10,,,,,,, +Investigator Name,Knight,,,,,,, +Experiment Name,RKL0151,,,,,,, +Date,5/6/24,,,,,,, +Workflow,GenerateFASTQ,,,,,,, +Application,FASTQ Only,,,,,,, +Assay,Metagenomic,,,,,,, +Description,,,,,,,, +Chemistry,Default,,,,,,, +,,,,,,,, +[Reads],,,,,,,, +151,,,,,,,, +151,,,,,,,, +,,,,,,,, +[Settings],,,,,,,, +ReverseComplement,0,,,,,,, +,,,,,,,, +[Data],,,,,,,, +Sample_ID,Sample_Name,Sample_Plate,well_id_384,barcode_id,Sample_Project,Well_description,Lane, +LS_8_10_2013_SRE,LS.8.10.2013.SRE,LS_Donor_SS_Samples_P1,A1,C501,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.10.2013.SRE,1, +LS_12_17_2014_SRE,LS.12.17.2014.SRE,LS_Donor_SS_Samples_P1,B1,C509,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.17.2014.SRE,1, +LS_4_4_2015_SRE,LS.4.4.2015.SRE,LS_Donor_SS_Samples_P1,C1,C502,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.4.2015.SRE,1, +LS_2_23_2015_SRE,LS.2.23.2015.SRE,LS_Donor_SS_Samples_P1,D1,C510,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.23.2015.SRE,1, +LS_9_28_2014_SRE,LS.9.28.2014.SRE,LS_Donor_SS_Samples_P1,E1,C503,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.9.28.2014.SRE,1, +LS_12_14_2013_SRE,LS.12.14.2013.SRE,LS_Donor_SS_Samples_P1,F1,C511,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.14.2013.SRE,1, +LS_4_7_2013_SRE,LS.4.7.2013.SRE,LS_Donor_SS_Samples_P1,G1,C504,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.7.2013.SRE,1, +LS_7_14_2013_SRE,LS.7.14.2013.SRE,LS_Donor_SS_Samples_P1,H1,C512,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.14.2013.SRE,1, +LS_10_27_2013_SRE,LS.10.27.2013.SRE,LS_Donor_SS_Samples_P1,I1,C505,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.10.27.2013.SRE,1, +LS_1_19_2014_SRE,LS.1.19.2014.SRE,LS_Donor_SS_Samples_P1,J1,C513,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.19.2014.SRE,1, +LS_9_3_2013_SRE,LS.9.3.2013.SRE,LS_Donor_SS_Samples_P1,K1,C506,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.9.3.2013.SRE,1, +LS_2_25_2013_SRE,LS.2.25.2013.SRE,LS_Donor_SS_Samples_P1,L1,C514,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.25.2013.SRE,1, +LS_7_26_2015_SRE,LS.7.26.2015.SRE,LS_Donor_SS_Samples_P1,M1,C507,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.26.2015.SRE,1, +LS_2_17_2014_SRE,LS.2.17.2014.SRE,LS_Donor_SS_Samples_P1,N1,C515,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.17.2014.SRE,1, +LS_6_29_2015_SRE,LS.6.29.2015.SRE,LS_Donor_SS_Samples_P1,O1,C508,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.29.2015.SRE,1, 
+LS_3_24_2015_SRE,LS.3.24.2015.SRE,LS_Donor_SS_Samples_P1,P1,C516,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.24.2015.SRE,1, +LS_1_6_2015_SRE,LS.1.6.2015.SRE,LS_Donor_SS_Samples_P1,A2,C517,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.6.2015.SRE,1, +T_LS_7_15_15B_SRE,T.LS.7.15.15B.SRE,LS_Donor_SS_Samples_P1,B2,C525,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.15.15B.SRE,1, +LS_6_9_2013_SRE,LS.6.9.2013.SRE,LS_Donor_SS_Samples_P1,C2,C518,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.9.2013.SRE,1, +Person A_SRE,Person A.SRE,LS_Donor_SS_Samples_P1,D2,C526,Tellseq_Shortread_Metagenomic_Analysis_10283,Person A.SRE,1, +LS_8_22_2014_R2_SRE,LS.8.22.2014.R2.SRE,LS_Donor_SS_Samples_P1,E2,C519,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.22.2014.R2.SRE,1, +Person B_SRE,Person B.SRE,LS_Donor_SS_Samples_P1,F2,C527,Tellseq_Shortread_Metagenomic_Analysis_10283,Person B.SRE,1, +LS_8_22_2014_R1_SRE,LS.8.22.2014.R1.SRE,LS_Donor_SS_Samples_P1,G2,C520,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.22.2014.R1.SRE,1, +Person C_SRE,Person C.SRE,LS_Donor_SS_Samples_P1,H2,C528,Tellseq_Shortread_Metagenomic_Analysis_10283,Person C.SRE,1, +LS_12_28_2011_SRE,LS.12.28.2011.SRE,LS_Donor_SS_Samples_P1,I2,C521,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.28.2011.SRE,1, +Person D_SRE,Person D.SRE,LS_Donor_SS_Samples_P1,J2,C529,Tellseq_Shortread_Metagenomic_Analysis_10283,Person D.SRE,1, +LS_5_4_2014_SRE,LS.5.4.2014.SRE,LS_Donor_SS_Samples_P1,K2,C522,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.5.4.2014.SRE,1, +45208_1_1,45208.1.1,UROBIOME_TEST_MF_SAMPLES_P2,L2,C530,Tellseq_Shortread_Metagenomic_Analysis_10283,45208.1.1,1, +LS_11_6_2012_SRE,LS.11.6.2012.SRE,LS_Donor_SS_Samples_P1,M2,C523,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.6.2012.SRE,1, +45248_2_2,45248.2.2,UROBIOME_TEST_MF_SAMPLES_P2,N2,C531,Tellseq_Shortread_Metagenomic_Analysis_10283,45248.2.2,1, +LS_4_3_2012_SRE,LS.4.3.2012.SRE,LS_Donor_SS_Samples_P1,O2,C524,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.3.2012.SRE,1, +45261_2_1,45261.2.1,UROBIOME_TEST_MF_SAMPLES_P2,P2,C532,Tellseq_Shortread_Metagenomic_Analysis_10283,45261.2.1,1, +45272_11_2,45272.11.2,UROBIOME_TEST_MF_SAMPLES_P2,A3,C533,Tellseq_Shortread_Metagenomic_Analysis_10283,45272.11.2,1, +T_LS_7_12_15A,T.LS.7.12.15A,Larry_Smarr_Plus_Donor_Samples_P3,B3,C541,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.12.15A,1, +45316_8_1,45316.8.1,UROBIOME_TEST_MF_SAMPLES_P2,C3,C534,Tellseq_Shortread_Metagenomic_Analysis_10283,45316.8.1,1, +T_LS_7_8_15A,T.LS.7.8.15A,Larry_Smarr_Plus_Donor_Samples_P3,D3,C542,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.8.15A,1, +45327_7_2,45327.7.2,UROBIOME_TEST_MF_SAMPLES_P2,E3,C535,Tellseq_Shortread_Metagenomic_Analysis_10283,45327.7.2,1, +LS_8_10_2013,LS.8.10.2013,LS_Time_Series_ABSQ_P4,F3,C543,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.10.2013,1, +45272_1_swab_2,45272.1.swab.2,UROBIOME_TEST_MF_SAMPLES_P2,G3,C536,Tellseq_Shortread_Metagenomic_Analysis_10283,45272.1.swab.2,1, +LS_6_29_2015,LS.6.29.2015,LS_Time_Series_ABSQ_P4,H3,C544,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.29.2015,1, +45326_1_swab_2,45326.1.swab.2,UROBIOME_TEST_MF_SAMPLES_P2,I3,C537,Tellseq_Shortread_Metagenomic_Analysis_10283,45326.1.swab.2,1, +LS_3_8_2015,LS.3.8.2015,LS_Time_Series_ABSQ_P4,J3,C545,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.8.2015,1, +T_LS_7_19_15A,T.LS.7.19.15A,Larry_Smarr_Plus_Donor_Samples_P3,K3,C538,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.19.15A,1, 
+LS_4_29_2013,LS.4.29.2013,LS_Time_Series_ABSQ_P4,L3,C546,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.29.2013,1, +T_LS_7_15_15B,T.LS.7.15.15B,Larry_Smarr_Plus_Donor_Samples_P3,M3,C539,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.15.15B,1, +LS_11_16_2014,LS.11.16.2014,LS_Time_Series_ABSQ_P4,N3,C547,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.16.2014,1, +T_LS_7_19_15B,T.LS.7.19.15B,Larry_Smarr_Plus_Donor_Samples_P3,O3,C540,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.19.15B,1, +LS_1_19_2014,LS.1.19.2014,LS_Time_Series_ABSQ_P4,P3,C548,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.19.2014,1, +LS_3_24_2015,LS.3.24.2015,LS_Time_Series_ABSQ_P4,A4,C549,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.24.2015,1, +LS_2_8_2013,LS.2.8.2013,LS_Time_Series_ABSQ_P4,B4,C557,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.8.2013,1, +LS_11_10_2013,LS.11.10.2013,LS_Time_Series_ABSQ_P4,C4,C550,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.10.2013,1, +Marine_Sediment_0_2cm_R1,Marine.Sediment.0.2cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,D4,C558,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.0.2cm.R1,1, +LS_3_23_2014,LS.3.23.2014,LS_Time_Series_ABSQ_P4,E4,C551,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.23.2014,1, +Marine_Sediment_5_7cm_R1,Marine.Sediment.5.7cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,F4,C559,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.5.7cm.R1,1, +LS_1_14_2015,LS.1.14.2015,LS_Time_Series_ABSQ_P4,G4,C552,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.14.2015,1, +Marine_Sediment_10_12cm_R2,Marine.Sediment.10.12cm.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,H4,C560,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.10.12cm.R2,1, +LS_8_25_2014,LS.8.25.2014,LS_Time_Series_ABSQ_P4,I4,C553,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.25.2014,1, +Marine_Sediment_15_17cm_R1,Marine.Sediment.15.17cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,J4,C561,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.15.17cm.R1,1, +LS_1_26_2013,LS.1.26.2013,LS_Time_Series_ABSQ_P4,K4,C554,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.26.2013,1, +Marine_Sediment_20_22cm_R1,Marine.Sediment.20.22cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,L4,C562,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.20.22cm.R1,1, +LS_6_16_2014,LS.6.16.2014,LS_Time_Series_ABSQ_P4,M4,C555,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.16.2014,1, +Marine_Sediment_25_27cm_R2,Marine.Sediment.25.27cm.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,N4,C563,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.25.27cm.R2,1, +LS_7_27_2014,LS.7.27.2014,LS_Time_Series_ABSQ_P4,O4,C556,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.27.2014,1, +Marine_Sediment_30_32cm_R3,Marine.Sediment.30.32cm.R3,MarineSediment_Donor_LarrySmarr_NoProK_P5,P4,C564,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.30.32cm.R3,1, +Person_A_R3,Person.A.R3,MarineSediment_Donor_LarrySmarr_NoProK_P5,A5,C565,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.A.R3,1, +Soil_SynCom_T4_2_Tube5,Soil.SynCom.T4.2.Tube5,16_member_community_native_soil_P6,B5,C573,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T4.2.Tube5,1, +Person_B_R2,Person.B.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,C5,C566,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.B.R2,1, +A21,A21,Tumor_Community_P7,D5,C574,Tellseq_Shortread_Metagenomic_Analysis_10283,A21,1, 
+Person_C_R4,Person.C.R4,MarineSediment_Donor_LarrySmarr_NoProK_P5,E5,C567,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.C.R4,1, +A23,A23,Tumor_Community_P7,F5,C575,Tellseq_Shortread_Metagenomic_Analysis_10283,A23,1, +Person_D_R2,Person.D.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,G5,C568,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.D.R2,1, +A27,A27,Tumor_Community_P7,H5,C576,Tellseq_Shortread_Metagenomic_Analysis_10283,A27,1, +Soil_SynCom_T1_2_Tube1,Soil.SynCom.T1.2.Tube1,16_member_community_native_soil_P6,I5,C569,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T1.2.Tube1,1, +A30,A30,Tumor_Community_P7,J5,C577,Tellseq_Shortread_Metagenomic_Analysis_10283,A30,1, +Soil _SynCom_T2_2_Tube2,Soil .SynCom.T2.2.Tube2,16_member_community_native_soil_P6,K5,C570,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil .SynCom.T2.2.Tube2,1, +A31,A31,Tumor_Community_P7,L5,C578,Tellseq_Shortread_Metagenomic_Analysis_10283,A31,1, +Soil_SynCom_T3_2_Tube3,Soil.SynCom.T3.2.Tube3,16_member_community_native_soil_P6,M5,C571,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T3.2.Tube3,1, +S1_T1_A,S1.T1.A,Tumor_Community_P7,N5,C579,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.A,1, +Soil_SynCom_T4_1_Tube4,Soil.SynCom.T4.1.Tube4,16_member_community_native_soil_P6,O5,C572,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T4.1.Tube4,1, +S2_T1_B_A,S2.T1.B.A,Tumor_Community_P7,P5,C580,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.B.A,1, +S2_T1_01BH1_Y_A,S2.T1.01BH1.Y.A,Tumor_Community_P7,A6,C581,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.01BH1.Y.A,1, +S1_T1_1CIM_A,S1.T1.1CIM.A,Tumor_Community_P7,B6,C589,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.1CIM.A,1, +S2_MT1_1HBI_Y_A,S2.MT1.1HBI.Y.A,Tumor_Community_P7,C6,C582,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.MT1.1HBI.Y.A,1, +S1_M1_B_1CIM_A,S1.M1.B.1CIM.A,Tumor_Community_P7,D6,C590,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.M1.B.1CIM.A,1, +S1_T1_B_LBM_A,S1.T1.B.LBM.A,Tumor_Community_P7,E6,C583,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.B.LBM.A,1, +BLANK_K15_cancer_patient,BLANK.K15.cancer.patient,Tumor_Community_P7,F6,C591,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.K15.cancer.patient,1, +S2_MT1_LBM_A,S2.MT1.LBM.A,Tumor_Community_P7,G6,C584,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.MT1.LBM.A,1, +BLANK_M15_cancer_patient,BLANK.M15.cancer.patient,Tumor_Community_P7,H6,C592,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.M15.cancer.patient,1, +S2_T1_A,S2.T1.A,Tumor_Community_P7,I6,C585,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.A,1, +BLANK_O15_cancer_patient,BLANK.O15.cancer.patient,Tumor_Community_P7,J6,C593,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.O15.cancer.patient,1, +1CIM_M_CNTL_A,1CIM.M.CNTL.A,Tumor_Community_P7,K6,C586,Tellseq_Shortread_Metagenomic_Analysis_10283,1CIM.M.CNTL.A,1, +BLANK_A17_cancer_patient,BLANK.A17.cancer.patient,Tumor_Community_P7,L6,C594,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.A17.cancer.patient,1, +1CIM_G_CNTL_A,1CIM.G.CNTL.A,Tumor_Community_P7,M6,C587,Tellseq_Shortread_Metagenomic_Analysis_10283,1CIM.G.CNTL.A,1, +BLANK_C17_cancer_patient,BLANK.C17.cancer.patient,Tumor_Community_P7,N6,C595,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.C17.cancer.patient,1, +GC_1HCOM_A,GC.1HCOM.A,Tumor_Community_P7,O6,C588,Tellseq_Shortread_Metagenomic_Analysis_10283,GC.1HCOM.A,1, +BLANK_E17_cancer_patient,BLANK.E17.cancer.patient,Tumor_Community_P7,P6,C596,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.E17.cancer.patient,1, +,,,,,,,, 
+[Bioinformatics],,,,,,,, +Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates +Tellseq_Shortread_Metagenomic_Analysis_10283,10283,TRUE,GATCGGAAGAGCACACGTCTGAACTCCAGTCAC,GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT,TRUE,tellseq,tellseq metagenomics,FALSE +,,,,,,,, +[Contact],,,,,,,, +Sample_Project,Email,,,,,,, +Tellseq_Shortread_Metagenomic_Analysis_10283,cbrenchy@gmail.com,,,,,,, +,,,,,,,, +[SampleContext],,,,,,,, +sample_name,sample_type,primary_qiita_study,secondary_qiita_studies,,,,, +BLANK.K15.cancer.patient,control blank,10283,,,,,, +BLANK.M15.cancer.patient,control blank,10283,,,,,, +BLANK.O15.cancer.patient,control blank,10283,,,,,, +BLANK.A17.cancer.patient,control blank,10283,,,,,, +BLANK.C17.cancer.patient,control blank,10283,,,,,, +BLANK.E17.cancer.patient,control blank,10283,,,,,, \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch new file mode 100644 index 00000000..f7a53198 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch @@ -0,0 +1,67 @@ +#!/bin/bash -l +#SBATCH -J integrate +#SBATCH --time 96:00:00 +#SBATCH --mem 16G +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH -p qiita +#SBATCH --array=1-96 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/logs/integrate_%x_%A_%a.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/logs/integrate_%x_%A_%a.err + +set -x +set -e + +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/tmp + +# get list of samples and determine which sample this array instance will work +# on. +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "Processing sample ${sample}..." + +# make temp directory +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/tmp +mkdir -p $TMPDIR + + +# TODO: All three input files must be non-zero in length. +# If possible, do this check as part of normal FSR operation. +# Previously this was done right here BEFORE integrating, rather +# than after. + +# NB: non-zero file-length check removed for now. This should be performed +# by FSR after processing is done. 
+# TODO: Make sure raw_fastq_dir is TellReadJob/Full +r1_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_R1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +r2_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_R2_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +i1_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_I1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq + +# create output directory +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated + +# generate output file names +r1_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R1.fastq.gz +r2_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R2.fastq.gz +i1_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.I1.fastq.gz + +# generate 'integrated' I1 fastq.gz file. We do this as part of each array so +# they're done in parallel. +gzip -c ${i1_in} > ${i1_out} + +# generate integrated R1 and R2 fastq.gz files. +conda activate qp-knight-lab-processing-2022.03 + +python sequence_processing_pipeline/contrib/integrate-indices-np.py integrate \ +--no-sort \ +--r1-in ${r1_in} \ +--r2-in ${r2_in} \ +--i1-in ${i1_in} \ +--r1-out ${r1_out} \ +--r2-out ${r2_out} \ +--threads 4 \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch new file mode 100644 index 00000000..9dc3ccff --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch @@ -0,0 +1,47 @@ +#!/bin/bash -l +#SBATCH -J tellread +#SBATCH -p qiita +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH --mem 16G +#SBATCH --time 96:00:00 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/logs/tellread_%x-%A.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/logs/tellread_%x-%A.err + +set -x + +module load singularity_3.6.4 +$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ + -i sequence_processing_pipeline/tests/data/sample_run_directories/150629_SN1001_0511_AH5L7GBCXX \ + -o sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob \ + -s $(echo C501,C509,C502,C510,C503,C511,C504,C512,C505,C513,C506,C514,C507,C515,C508,C516,C517,C525,C518,C526,C519,C527,C520,C528,C521,C529,C522,C530,C523,C531,C524,C532,C533,C541,C534,C542,C535,C543,C536,C544,C537,C545,C538,C546,C539,C547,C540,C548,C549,C557,C550,C558,C551,C559,C552,C560,C553,C561,C554,C562,C555,C563,C556,C564,C565,C573,C566,C574,C567,C575,C568,C576,C569,C577,C570,C578,C571,C579,C572,C580,C581,C589,C582,C590,C583,C591,C584,C592,C585,C593,C586,C594,C587,C595,C588,C596 | tr -d '"') \ + -g $(echo 
NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} \ + -l s_1 + +# get the timestamp for the most recently changed file in directory '.' + +# hard-limit for wait time set to ~ 8 hours. +# (4 checks per hour, for 8 hours equals 32 iterations) +for i in $(seq 1 32); +do + before="$(find sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full -type f -printf '%T@\n' | sort -n | tail -1)" + # assume TellReadJob is finished if ctime hasn't changed in 15 minutes + # for any fastq file in the directory. + sleep 900 + after="$(find sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full -type f -printf '%T@\n' | sort -n | tail -1)" + + echo "$before $after" + + if [[ "$before" == "$after" ]]; then + echo "DONE" + exit 0 + else + echo "NOT DONE" + fi +done + +# if we've reached this point then we've exceeded our hard-limit for waiting. +# return w/an error. +exit 1 diff --git a/sequence_processing_pipeline/tests/test_ConvertJob.py b/sequence_processing_pipeline/tests/test_ConvertJob.py index e8879864..f394cfb8 100644 --- a/sequence_processing_pipeline/tests/test_ConvertJob.py +++ b/sequence_processing_pipeline/tests/test_ConvertJob.py @@ -955,7 +955,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. self.assertTrue(job._toggle_force_job_fail()) error_msg = ("This job died.\n2024-01-01T12:12:12Z thread 99999 ERROR:" diff --git a/sequence_processing_pipeline/tests/test_FastQCJob.py b/sequence_processing_pipeline/tests/test_FastQCJob.py index 28fe52cb..a2291296 100644 --- a/sequence_processing_pipeline/tests/test_FastQCJob.py +++ b/sequence_processing_pipeline/tests/test_FastQCJob.py @@ -1121,7 +1121,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. 
self.assertTrue(job._toggle_force_job_fail()) try: diff --git a/sequence_processing_pipeline/tests/test_Job.py b/sequence_processing_pipeline/tests/test_Job.py index 7aa5889a..192709b6 100644 --- a/sequence_processing_pipeline/tests/test_Job.py +++ b/sequence_processing_pipeline/tests/test_Job.py @@ -1,10 +1,10 @@ import unittest from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import PipelineError -from os.path import abspath, join, dirname -from os import makedirs +from os.path import abspath, join, dirname, split, isdir +from os import makedirs, chmod, remove from functools import partial -from shutil import rmtree +from shutil import rmtree, copyfile import re @@ -14,7 +14,10 @@ def setUp(self): def tearDown(self): for some_path in self.remove_these: - rmtree(some_path) + if isdir(some_path): + rmtree(some_path) + else: + remove(some_path) def test_system_call(self): package_root = abspath('./sequence_processing_pipeline') @@ -123,6 +126,168 @@ def test_extract_project_names_from_fastq_dir(self): obs = job.extract_project_names_from_fastq_dir(tmp) self.assertEqual(obs, ['NPH_15288']) + def test_query_slurm(self): + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + # set up a fake job so that we can test the query_jobs() method. + # it doesn't matter what the parameters are so long as the job + # passes initialization. + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + # locate python binary path + # we have a python script called fake_squeue.py that can simulate + # repeated calls to squeue. It does this by generating a fake random + # set of array job ids for each job id passed to it and records their + # state in my_state.json. Each array job is set to change state from + # RUNNING to either COMPLETED or FAILED between three to seven squeue + # calls. The choice of which job-ids will succeed or fail, as is which + # individual array-ids will succeed or fail is random. + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + + # place the fake squeue file in a place that's known to be in the + # PATH. Make sure this file is removed after this test is complete. + # Also make sure the saved state file is removed. + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1234567', '1234568', '1234569', '1234570'] + jobs = job._query_slurm(job_ids) + + # jobs is a dictionary of unique array_ids and/or job-ids for non- + # array jobs. The faked squeue reports anywhere between five and + # fifteen array jobs for a given job-id. After the first invocation + # all processes should be in the 'RUNNING' state. + # e.g.: "1234567_1": "RUNNING" + + for j in jobs: + self.assertEqual(jobs[j], 'RUNNING') + if '_' in j: + jid, aid = j.split('_') + else: + jid = j + aid = None + + # assert the job id component of the array-id is a valid job id. + self.assertIn(jid, job_ids) + + if aid: + # assert the array-id component of the array-id is between 0 + # and 15 as defined in the fake squeue script. 
+ aid = int(aid) + self.assertLess(aid, 15) + self.assertGreaterEqual(aid, 0) + + def test_query_slurm_single_job(self): + # perform test_query_slurm() but with a single job only. + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + # set up a fake job so that we can test the query_jobs() method. + # it doesn't matter what the parameters are so long as the job + # passes initialization. + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + # locate python binary path + # we have a python script called fake_squeue.py that can simulate + # repeated calls to squeue. It does this by generating a fake random + # set of array job ids for each job id passed to it and records their + # state in my_state.json. Each array job is set to change state from + # RUNNING to either COMPLETED or FAILED between three to seven squeue + # calls. The choice of which job-ids will succeed or fail, as is which + # individual array-ids will succeed or fail is random. + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + + # place the fake squeue file in a place that's known to be in the + # PATH. Make sure this file is removed after this test is complete. + # Also make sure the saved state file is removed. + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1234567'] + jobs = job._query_slurm(job_ids) + + # jobs is a dictionary of unique array_ids and/or job-ids for non- + # array jobs. The faked squeue reports anywhere between five and + # fifteen array jobs for a given job-id. After the first invocation + # all processes should be in the 'RUNNING' state. + # e.g.: "1234567_1": "RUNNING" + + for j in jobs: + self.assertEqual(jobs[j], 'RUNNING') + if '_' in j: + jid, aid = j.split('_') + else: + jid = j + aid = None + + # assert the job id component of the array-id is a valid job id. + self.assertIn(jid, job_ids) + + if aid: + # assert the array-id component of the array-id is between 0 + # and 15 as defined in the fake squeue script. + aid = int(aid) + self.assertLess(aid, 15) + self.assertGreaterEqual(aid, 0) + + def test_wait_on_job_ids(self): + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1', '2', '3', '4'] + + # to shorten the test time, set polling_interval_in_seconds to be + # lower than one minute. + Job.polling_interval_in_seconds = 10 + results = job.wait_on_job_ids(job_ids) + + # calling query_slurm one more time after wait_on_job_ids() is called + # will technically advance the counter one more, which means that this + # doesn't confirm that wait_on_job_ids() doesn't return before EVERY + # single job is either COMPLETED or FAILED. 
However it does confirm + # that wait_on_job_ids() doesn't return once the FIRST completed array + # job is either COMPLETED or FAILED while others are still RUNNING. + # This was previously an issue. + obs = job._query_slurm(job_ids) + + for array_id in obs: + state = obs[array_id] + # w/out relying on states defined in Job, simply confirm all are + # either COMPLETED or FAILED. + self.assertIn(state, ['COMPLETED', 'FAILED']) + + # since wait_on_job_ids() now returns the same data structure as + # query_slurm(), they should be equal. + self.assertDictEqual(obs, results) + if __name__ == '__main__': unittest.main() diff --git a/sequence_processing_pipeline/tests/test_NuQCJob.py b/sequence_processing_pipeline/tests/test_NuQCJob.py index a3cfdada..8737552b 100644 --- a/sequence_processing_pipeline/tests/test_NuQCJob.py +++ b/sequence_processing_pipeline/tests/test_NuQCJob.py @@ -9,7 +9,7 @@ ) from os import makedirs, remove from metapool import load_sample_sheet -import glob +from os import walk class TestNuQCJob(unittest.TestCase): @@ -996,7 +996,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. self.assertTrue(job._toggle_force_job_fail()) try: @@ -2208,10 +2208,56 @@ def test_generate_mmi_filter_cmds_w_annotate_fastq(self): self.assertEqual(obs, exp) def test_move_trimmed(self): - # Note: this test does not make use of the output_dir that other - # tests use. + # create a NuQCJob() object, but do not call run(). + # instead we will manually create some files to test with. + double_db_paths = ["db_path/mmi_1.db", "db_path/mmi_2.db"] + job = NuQCJob( + self.fastq_root_path, + self.output_path, + self.good_sample_sheet_path, + double_db_paths, + "queue_name", + 1, + 1440, + "8", + "fastp", + "minimap2", + "samtools", + [], + self.qiita_job_id, + 1000, + "", + self.movi_path, + self.gres_value, + self.pmls_path, + ['BX'] + ) - for dummy_fp in SAMPLE_DIR: + sample_dir = [ + "NuQCJob/only-adapter-filtered/EP890158A02_S58_L001_R1_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP890158A02_S58_L001_R2_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP023801B04_S27_L001_R1_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP023801B04_S27_L001_R2_001." + "interleave.fastq.gz", + "NuQCJob/NPH_15288/fastp_reports_dir/html/EP890158A02_S58_L001_" + "R1_001.html", + "NuQCJob/NPH_15288/fastp_reports_dir/json/EP023801B04_S27_L001_" + "R1_001.json", + "NuQCJob/process_all_fastq_files.sh", + "NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981." + "completed", + "NuQCJob/logs/slurm-1897981_1.out", + "NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1", + 'NuQCJob/only-adapter-filtered/CDPH-SAL_' + 'Salmonella_Typhi_MDL-150__S36_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/CDPH-SAL_' + 'Salmonella_Typhi_MDL-150__S36_L001_R2_001.interleave.fastq.gz', + ] + + for dummy_fp in sample_dir: dummy_fp = self.path(dummy_fp) dummy_path = dirname(dummy_fp) makedirs(dummy_path, exist_ok=True) @@ -2220,38 +2266,33 @@ def test_move_trimmed(self): trimmed_only_path = self.path("NuQCJob", "only-adapter-filtered") - NuQCJob._move_trimmed_files("NPH_15288", trimmed_only_path) - - new_path = join(trimmed_only_path, "NPH_15288") - pattern = f"{new_path}/*.fastq.gz" - - exp = [ - ( - "only-adapter-filtered/NPH_15288/359180345_S58_L001_R1_001." 
- "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180337_S27_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180338_S51_L001_R2_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180338_S51_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180337_S27_L001_R2_001." - "fastq.gz" - ), - ] - - for trimmed_file in list(glob.glob(pattern)): - trimmed_file = trimmed_file.split("NuQCJob/")[-1] - if trimmed_file not in exp: - self.assertIn(trimmed_file, exp) + # test _move_trimmed_files() by verifying that only the interleave + # fastq files from the NYU project are moved. + job._move_trimmed_files("NYU_BMS_Melanoma_13059", trimmed_only_path) + + new_path = join(trimmed_only_path, "NYU_BMS_Melanoma_13059") + + exp = { + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP890158A02' + '_S58_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP023801B04' + '_S27_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP890158A02' + '_S58_L001_R2_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP023801B04' + '_S27_L001_R2_001.interleave.fastq.gz' + } + + obs = [] + for root, dirs, files in walk(new_path): + for some_file in files: + some_path = join(root, some_file) + some_path = some_path.replace(self.path(""), "") + obs.append(some_path) + + # confirm that only the samples in NYU_BMS_Melanoma_13059 were + # moved. + self.assertEqual(set(obs), exp) def _helper(self, regex, good_names, bad_names): for good_name in good_names: @@ -2263,27 +2304,5 @@ def _helper(self, regex, good_names, bad_names): self.assertIsNone(substr, msg=f"Regex failed on {bad_name}") -SAMPLE_DIR = [ - "NuQCJob/only-adapter-filtered/359180345_S58_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180337_S27_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180338_S51_L001_R2_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180338_S51_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180337_S27_L001_R2_001.fastq.gz", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180354_S22_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180338_S51_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180345_S58_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180337_S27_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180353_S17_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180353_S17_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180337_S27_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180345_S58_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180338_S51_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180354_S22_L001_R1_001.json", - "NuQCJob/process_all_fastq_files.sh", - "NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981.completed", - "NuQCJob/logs/slurm-1897981_1.out", - "NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1", -] - if __name__ == "__main__": unittest.main() diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py index 53267c7d..0af31439 100644 --- a/sequence_processing_pipeline/tests/test_Pipeline.py +++ b/sequence_processing_pipeline/tests/test_Pipeline.py @@ -140,7 +140,7 @@ def test_validate_mapping_file_numeric_ids(self): with NamedTemporaryFile() as tmp: 
self._make_mapping_file(tmp.name) exp = ['1.0', '1e-3'] - pipeline = Pipeline(self.good_config_file, self.good_run_id, None, + pipeline = Pipeline(self.good_config_file, self.good_run_id, tmp.name, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -153,7 +153,7 @@ def test_validate_mapping_file_numeric_ids(self): def test_get_sample_names_from_sample_sheet(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.mp_sheet_path, None, + self.mp_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -178,7 +178,7 @@ def test_get_sample_names_from_sample_sheet(self): def test_get_orig_names_from_sheet_with_replicates(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sheet_w_replicates, None, + self.good_sheet_w_replicates, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -198,7 +198,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" "' is not present."): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -210,7 +210,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" ".txt' is not present."): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -222,7 +222,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, bu" "t not readable"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) self.make_runinfo_file_readable() @@ -232,7 +232,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.bad_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -249,7 +249,7 @@ def test_creation(self): " valid sample-sheet."): Pipeline(self.good_config_file, self.good_run_id, - self.bad_assay_type_path, None, + self.bad_assay_type_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -257,7 +257,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.invalid_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -268,7 +268,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(None, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -279,7 +279,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.invalid_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -290,7 +290,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, None, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -300,7 +300,7 @@ def 
test_creation(self): "not a valid json file"): Pipeline(self.good_sample_sheet_path, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -323,7 +323,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -345,7 +345,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -368,7 +368,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -379,7 +379,7 @@ def test_sample_sheet_validation(self): # contained w/in its 'message' member. try: Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) except PipelineError as e: @@ -389,7 +389,7 @@ def test_sample_sheet_validation(self): # test unsuccessful validation of a bad sample-sheet with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - self.bad_sample_sheet_path, None, + self.bad_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) self.assertEqual(str(e.exception), ('Sample-sheet contains errors:\n' @@ -401,7 +401,6 @@ def test_generate_sample_information_files(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, self.good_sample_sheet_path, - None, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -511,6 +510,62 @@ def test_generate_sample_information_files(self): exp = exp_last_lines[some_name] self.assertEqual(obs, exp) + def test_generate_sample_information_files_with_additional_meta(self): + # TODO: With recent changes is this needed? + # test sample-information-file generation. + pipeline = Pipeline(self.good_config_file, self.good_run_id, + self.good_sample_sheet_path, + self.output_file_path, self.qiita_id, + Pipeline.METAGENOMIC_PTYPE) + + # create a dataframe with duplicate information to pass to + # generate_sample_information_files(). Confirm that the duplicates + # are dropped. Confirm 'NOTBLANK_999A' is also filtered out. + df = pd.DataFrame(data=[('BLANK999_999A', 'NYU_BMS_Melanoma_13059'), + ('BLANK999_999A', 'NYU_BMS_Melanoma_13059'), + ('NOTBLANK_999A', 'NYU_BMS_Melanoma_13059')], + columns=['sample_name', 'project_name']) + + sif_path = pipeline.generate_sample_info_files(addl_info=df) + + # get the path for the NYU_BMS_Melanoma dataset. 
+ sif_path = [x for x in sif_path if 'NYU_BMS_Melanoma' in x][0] + + exp_first_line = ("BLANK1.1A\t2021-10-21\t193\t" + "Control\tNegative\tSterile water blank\t" + "Sterile water blank\turban biome\t" + "research facility\tsterile water\t" + "misc environment\tUSA:CA:San Diego\t" + "BLANK1.1A\t32.5\t-117.25\tcontrol blank\t" + "metagenome\t256318\tBLANK1.1A\t" + "NYU_BMS_Melanoma\tTRUE\tUCSD\tFALSE") + + exp_last_line = ("BLANK4.4H\t2021-10-21\t193\tControl\tNegative\t" + "Sterile water blank\tSterile water blank\t" + "urban biome\tresearch facility\tsterile water\t" + "misc environment\tUSA:CA:San Diego\tBLANK4.4H\t" + "32.5\t-117.25\tcontrol blank\tmetagenome\t256318\t" + "BLANK4.4H\tNYU_BMS_Melanoma\tTRUE\tUCSD\tFALSE") + + with open(sif_path, 'r') as f: + obs_lines = f.readlines() + + # confirm that each file contains the expected header. + header = obs_lines[0].strip() + self.assertEqual(header, '\t'.join(Pipeline.sif_header)) + + # confirm that the first line of each file is as expected. + obs = obs_lines[1].strip() + exp = exp_first_line + + self.assertEqual(obs, exp) + + # confirm that the last line of each file is as expected. + obs = obs_lines[-1].strip() + exp = exp_last_line + + self.assertEqual(obs, exp) + def test_get_sample_ids(self): exp_sample_ids = ['CDPH-SAL__Salmonella__Typhi__MDL-143', 'CDPH-SAL_Salmonella_Typhi_MDL-144', @@ -980,7 +1035,7 @@ def test_get_sample_ids(self): 'EP400448B04', 'EP479894B04'] # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1456,7 +1511,7 @@ def test_get_sample_names(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1484,7 +1539,7 @@ def test_get_project_info(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1515,7 +1570,7 @@ def test_get_project_info(self): self.assertEqual(sorted(obs_project_names), sorted(exp_project_names)) pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sheet_w_replicates, None, + self.good_sheet_w_replicates, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1527,7 +1582,7 @@ def test_get_project_info(self): def test_configuration_profiles(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1556,7 +1611,7 @@ def test_configuration_profiles(self): def test_parse_project_name(self): # test sample-information-file generation. 
pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1588,7 +1643,7 @@ def test_parse_project_name(self): def test_identify_reserved_words(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1605,7 +1660,7 @@ def test_identify_reserved_words(self): # create new pipeline using a/legacy (v90) metagenomic sample-sheet. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_legacy_sheet_path, None, + self.good_legacy_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1698,7 +1753,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" "' is not present."): Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1710,7 +1765,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" ".txt' is not present."): Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1721,7 +1776,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, " "but not readable"): - Pipeline(self.good_config_file, self.good_run_id, None, + Pipeline(self.good_config_file, self.good_run_id, self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.make_runinfo_file_readable() @@ -1731,7 +1786,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.bad_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1746,7 +1801,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.invalid_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1757,7 +1812,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(None, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1768,7 +1823,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.invalid_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1779,7 +1834,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, None, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1787,7 +1842,7 @@ def test_mapping_file_validation(self): # test successful validation of a good mapping-file. 
try: Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) except PipelineError as e: @@ -1797,7 +1852,7 @@ def test_mapping_file_validation(self): # test unsuccessful validation of a bad mapping-file. with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - None, self.mf_missing_column, + self.mf_missing_column, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.assertEqual(str(e.exception), ('Mapping-file is missing ' @@ -1807,7 +1862,7 @@ def test_mapping_file_validation(self): # test unsuccessful validation of a bad mapping-file. with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - None, self.mf_duplicate_sample, + self.mf_duplicate_sample, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.assertEqual(str(e.exception), ("Mapping-file contains duplicate " @@ -1834,7 +1889,6 @@ def test_is_sample_sheet(self): def test_generate_sample_information_files(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2056,7 +2110,6 @@ def test_get_sample_ids(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2184,7 +2237,6 @@ def test_get_sample_names(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2208,7 +2260,7 @@ def test_get_project_info(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -2225,33 +2277,12 @@ def test_get_project_info(self): self.assertDictEqual(obs_d, exp_d) break - def test_additional_constuctor_check(self): - with self.assertRaisesRegex(PipelineError, ("sample_sheet_path or " - "mapping_file_path must " - "be defined, but not " - "both.")): - Pipeline(self.good_config_file, self.good_run_id, - None, None, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - - with self.assertRaisesRegex(PipelineError, ("sample_sheet_path or " - "mapping_file_path must " - "be defined, but not " - "both.")): - Pipeline(self.good_config_file, self.good_run_id, - self.sample_sheet_path, - self.good_mapping_file_path, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - def test_dummy_sheet_generation(self): # generate a RunInfo.xml file w/only one indexed read. 
self.create_runinfo_file(four_reads=False) _ = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2270,7 +2301,6 @@ def test_dummy_sheet_generation(self): _ = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2290,7 +2320,6 @@ def test_dummy_sheet_generation(self): def test_process_run_info_file(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2330,7 +2359,6 @@ def test_process_run_info_file(self): def test_identify_reserved_words(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, diff --git a/sequence_processing_pipeline/tests/test_SeqCountsJob.py b/sequence_processing_pipeline/tests/test_SeqCountsJob.py new file mode 100644 index 00000000..d641c3b2 --- /dev/null +++ b/sequence_processing_pipeline/tests/test_SeqCountsJob.py @@ -0,0 +1,74 @@ +from os.path import join +from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob +from functools import partial +import unittest +from json import load as json_load + + +class TestSeqCountsJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.exp = self.path('data', 'tellseq_output', 'integrate_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. + self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.files_to_count_path = self.path("data", "files_to_count.txt") + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "1440" + self.jmem = "8" + self.modules_to_load = [] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.cores_per_task = "1" + self.raw_fastq_dir = join(self.output_path, "TellReadJob", "Full") + self.max_array_length = 100 + self.exp_sbatch_output = self.path("data", "seq_counts.sbatch") + self.exp_results = self.path("data", + "aggregate_counts_results.json") + + def test_creation(self): + def compare_files(obs, exp): + with open(obs, 'r') as f: + obs_lines = f.readlines() + obs_lines = [x.strip() for x in obs_lines] + obs_lines = [x for x in obs_lines if x != ''] + + with open(exp, 'r') as f: + exp_lines = f.readlines() + exp_lines = [x.strip() for x in exp_lines] + exp_lines = [x for x in exp_lines if x != ''] + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + # test basic good-path + job = SeqCountsJob(self.run_dir, self.output_path, self.queue_name, + self.node_count, self.wall_time_limit, self.jmem, + self.modules_to_load, self.qiita_job_id, + self.max_array_length, self.files_to_count_path, + self.cores_per_task) + + obs = job._generate_job_script() + + compare_files(obs, self.exp_sbatch_output) + + # hack log path so that it points to test data directory rather than + # the output directory for a run we didn't run(). 
+ job.log_path = self.path("data", "seq_counts_logs") + + obs = json_load(open(job._aggregate_counts(), 'r')) + exp = json_load(open(self.exp_results, 'r')) + + self.assertDictEqual(obs, exp) + + +if __name__ == '__main__': + unittest.main() diff --git a/sequence_processing_pipeline/tests/test_TRIntegrateJob.py b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py new file mode 100644 index 00000000..17ded346 --- /dev/null +++ b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py @@ -0,0 +1,72 @@ +from os.path import join +from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob +from functools import partial +import unittest + + +class TestTRIntegrateJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', + 'TRIntegrateJob', 'integrate_test.sbatch') + self.exp = self.path('data', 'tellseq_output', 'integrate_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. + self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.sample_sheet_path = self.path('data', + 'tellseq_metag_dummy_sample_' + 'sheet.csv') + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "96:00:00" + self.jmem = "16" + self.modules_to_load = ["singularity_3.6.4"] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.label = "150629_SN1001_0511_AH5L7GBCXX-test" + self.reference_base = "" + self.reference_map = "" + self.tmp1_path = join(self.output_path, "TRIntegrateJob", "output", + "tmp1") + # reflects location of script on host. 
+ self.sing_script_path = ("$HOME/qiita-spots/tellread-release-novaseqX/" + "run_tellread_sing.sh") + self.lane = "1" + self.cores_per_task = "4" + self.integrate_script_path = join(package_root, "contrib", + "integrate-indices-np.py") + self.sil_path = self.path('data', 'fake_sample_index_list.txt') + self.raw_fastq_dir = join(self.output_path, "TellReadJob", "Full") + + def test_creation(self): + # test basic good-path + job = TRIntegrateJob(self.run_dir, self.output_path, + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, + self.qiita_job_id, self.integrate_script_path, + self.sil_path, self.raw_fastq_dir, + self.reference_base, self.reference_map, + self.cores_per_task) + + job._generate_job_script() + + with open(self.obs, 'r') as f: + obs_lines = f.readlines() + + with open(self.exp, 'r') as f: + exp_lines = f.readlines() + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + +if __name__ == '__main__': + unittest.main() diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py new file mode 100644 index 00000000..4c6a75c2 --- /dev/null +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -0,0 +1,66 @@ +from os.path import join +from sequence_processing_pipeline.TellReadJob import TellReadJob +from functools import partial +import unittest + + +class TestTellReadJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', + 'TellReadJob', 'tellread_test.sbatch') + self.exp = self.path('data', 'tellseq_output', 'tellread_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. + self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.sample_sheet_path = self.path('data', + 'tellseq_metag_dummy_sample_' + 'sheet.csv') + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "96:00:00" + self.jmem = "16" + self.modules_to_load = ["singularity_3.6.4"] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.label = "150629_SN1001_0511_AH5L7GBCXX-test" + self.reference_base = "" + self.reference_map = "" + self.tmp1_path = join(self.output_path, "TellReadJob", "output", + "tmp1") + # reflects location of script on host. 
+ self.sing_script_path = ("$HOME/qiita-spots/tellread-release-novaseqX/" + "run_tellread_sing.sh") + self.lane = "1" + self.cores_per_task = "4" + + def test_creation(self): + # test basic good-path + job = TellReadJob(self.run_dir, self.output_path, + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.reference_base, self.reference_map, + self.sing_script_path, self.cores_per_task) + + job._generate_job_script() + + with open(self.obs, 'r') as f: + obs_lines = f.readlines() + + with open(self.exp, 'r') as f: + exp_lines = f.readlines() + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + +if __name__ == '__main__': + unittest.main() diff --git a/sequence_processing_pipeline/tests/test_commands.py b/sequence_processing_pipeline/tests/test_commands.py index f58bb176..ac8a4bd9 100644 --- a/sequence_processing_pipeline/tests/test_commands.py +++ b/sequence_processing_pipeline/tests/test_commands.py @@ -16,12 +16,12 @@ def test_split_similar_size_bins(self, glob, stat): class MockStat: st_size = 2 ** 28 # 256MB - mockglob = ['/foo/bar/a_R1_.fastq.gz', - '/foo/bar/b_R2_.fastq.gz', - '/foo/bar/a_R2_.fastq.gz', - '/foo/baz/c_R2_.fastq.gz', - '/foo/baz/c_R1_.fastq.gz', - '/foo/bar/b_R1_.fastq.gz'] + mockglob = ['/foo/bar/a_R1_001.fastq.gz', + '/foo/bar/b_R2_001.fastq.gz', + '/foo/bar/a_R2_001.fastq.gz', + '/foo/baz/c_R2_001.fastq.gz', + '/foo/baz/c_R1_001.fastq.gz', + '/foo/bar/b_R1_001.fastq.gz'] with TemporaryDirectory() as tmp: exp = (2, 1073741824) @@ -30,9 +30,12 @@ class MockStat: obs = split_similar_size_bins('foo', 1, tmp + '/prefix') self.assertEqual(obs, exp) - exp_1 = ('/foo/bar/a_R1_.fastq.gz\t/foo/bar/a_R2_.fastq.gz\tbar\n' - '/foo/bar/b_R1_.fastq.gz\t/foo/bar/b_R2_.fastq.gz\tbar\n') - exp_2 = '/foo/baz/c_R1_.fastq.gz\t/foo/baz/c_R2_.fastq.gz\tbaz\n' + exp_1 = ('/foo/bar/a_R1_001.fastq.gz\t/foo/bar/a_R2_001.fastq.gz' + '\tbar\n' + '/foo/bar/b_R1_001.fastq.gz\t/foo/bar/b_R2_001.fastq.gz' + '\tbar\n') + exp_2 = ('/foo/baz/c_R1_001.fastq.gz\t/foo/baz/c_R2_001.fastq.gz' + '\tbaz\n') obs_1 = open(tmp + '/prefix-1').read() self.assertEqual(obs_1, exp_1) @@ -56,10 +59,6 @@ def test_demux(self): '@2::MUX::bing/2', 'ATGC', '+', '!!!!', '']) infile = io.StringIO(infile_data) - exp_data_r1 = '\n'.join(['@baz/1', 'ATGC', '+', '!!!!', - '@bing/1', 'ATGC', '+', '!!!!', '']) - exp_data_r2 = '\n'.join(['@baz/2', 'ATGC', '+', '!!!!', - '@bing/2', 'ATGC', '+', '!!!!', '']) exp_data_r1 = ['@baz/1', 'ATGC', '+', '!!!!', '@bing/1', 'ATGC', '+', '!!!!'] diff --git a/sequence_processing_pipeline/tests/test_util.py b/sequence_processing_pipeline/tests/test_util.py index 136dc9a0..e5073101 100644 --- a/sequence_processing_pipeline/tests/test_util.py +++ b/sequence_processing_pipeline/tests/test_util.py @@ -4,24 +4,18 @@ class TestUtil(unittest.TestCase): def test_iter_paired_files(self): - tests = [(['a_R1_foo', - 'b_R2_bar', - 'a_R2_baz', - 'b_R1_bing'], - [('a_R1_foo', 'a_R2_baz'), - ('b_R1_bing', 'b_R2_bar')]), - (['a.R1.foo', - 'b.R2.bar', - 'a.R2.baz', - 'b.R1.bing'], - [('a.R1.foo', 'a.R2.baz'), - ('b.R1.bing', 'b.R2.bar')]), - (['a.R1.foo', - 'b_R2_bar', - 'a.R2.baz', - 'b_R1_bing'], - [('a.R1.foo', 'a.R2.baz'), - ('b_R1_bing', 'b_R2_bar')])] + # tuples of randomly ordered fastq files and thier expected + # sorted and organized output from iter_paired_files(). + + # underscore filenames updated to require '_001.fastq.gz'. 
+ # legacy dot filenames test remains as-is. + tests = [(['b_R2_001.fastq.gz', 'a_R1_001.fastq.gz', + 'a_R2_001.fastq.gz', 'b_R1_001.fastq.gz'], + [('a_R1_001.fastq.gz', 'a_R2_001.fastq.gz'), + ('b_R1_001.fastq.gz', 'b_R2_001.fastq.gz')]), + (['a.R1.foo', 'b.R2.bar', 'a.R2.baz', 'b.R1.bing'], + [('a.R1.foo', 'a.R2.baz'), ('b.R1.bing', 'b.R2.bar')])] + for files, exp in tests: obs = list(iter_paired_files(files)) self.assertEqual(obs, exp) @@ -42,7 +36,7 @@ def test_iter_paired_files_bad_pair(self): list(iter_paired_files(files)) def test_iter_paired_files_mismatch_prefix(self): - files = ['a_R1_foo', 'ab_R2_foo'] + files = ['a_R1_001.fastq.gz', 'ab_R2_001.fastq.gz'] with self.assertRaisesRegex(ValueError, "Mismatch prefixes"): list(iter_paired_files(files)) diff --git a/sequence_processing_pipeline/util.py b/sequence_processing_pipeline/util.py index d9586f81..e19bf98a 100644 --- a/sequence_processing_pipeline/util.py +++ b/sequence_processing_pipeline/util.py @@ -1,7 +1,18 @@ import re -PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') +# PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') +# The above will truncate on the first _R1_ found, which only works when _R1_ +# or _R2_ appears exactly once in a file path. When the wet-lab incorporates +# these same strings in their sample-names as descriptive metadata, this +# assumption is broken. For all raw fastq files being used as input into +# NuQCJob, we can assume they end in the following convention. Per Illumina +# spec, all fastq files end in _001 and we preserve this convention even at +# the cost of renaming output files from TRIntegrateJob. +# PAIR_DOT is kept as is, but may be removed later because for the purposes of +# SPP, no input should ever be named with dots instead of underscores. +PAIR_UNDERSCORE = (re.compile(r'_R1_001.fastq.gz'), + '_R1_001.fastq.gz', '_R2_001.fastq.gz') PAIR_DOT = (re.compile(r'\.R1\.'), '.R1.', '.R2.') PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT)
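
The pairing tuples above can be exercised on their own. The snippet below is a
minimal, self-contained sketch (not the pipeline's iter_paired_files()
implementation) showing how PAIR_TESTS-style tuples derive an R2 partner name
from an R1 name under the new '_R1_001.fastq.gz' convention. The partner_of()
helper and the example file names are hypothetical, and the regex dots are
escaped here for strictness.

    import re

    # same shape as the tuples defined above; escaping the dots is an
    # assumption made for this sketch only.
    PAIR_UNDERSCORE = (re.compile(r'_R1_001\.fastq\.gz'),
                       '_R1_001.fastq.gz', '_R2_001.fastq.gz')
    PAIR_DOT = (re.compile(r'\.R1\.'), '.R1.', '.R2.')
    PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT)


    def partner_of(r1_name):
        # return the expected R2 file name for an R1 file name, or None if
        # the name matches neither naming convention.
        for pattern, r1_token, r2_token in PAIR_TESTS:
            if pattern.search(r1_name):
                return r1_name.replace(r1_token, r2_token)
        return None


    # a descriptive sample name containing '_R1_' no longer confuses the
    # underscore convention, because only the full '_R1_001.fastq.gz' token
    # is matched and swapped.
    print(partner_of('Sample_R1_test_S1_L001_R1_001.fastq.gz'))
    # Sample_R1_test_S1_L001_R2_001.fastq.gz
    print(partner_of('a.R1.foo'))  # legacy dot convention: a.R2.foo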