From b6fb6ff65f0331a00c3f603c42ecba14eee5b63f Mon Sep 17 00:00:00 2001 From: Charles Cowart <42684307+charles-cowart@users.noreply.github.com> Date: Tue, 3 Dec 2024 10:48:35 -0800 Subject: [PATCH] Add tellread (#149) * initial add * initial cleanup * first pass at converting TELLREAD scripts * Second pass at integrating tellread scripts * third pass adding tellread * fourth pass * Fifth pass, tested on qiita-rc and then refactored. * Manually merged with current master * Manually merged with master * Updates based on testing in qiita-rc * flake8 * Small fixes * Refactor KISSLoader to be more DRY. * Pipeline.py updated to support changes in qp-klp * Version 2.0 of TellSeq support. Version 2.0 of TellSeq support removes the master tellread.sh script and the drop-in replacement TRConvertJob.py for Job()s that wrap individual steps in the original script. These steps can be used in whole or in part in varying order in the refactored SPP plugin (qp-klp). * Creation tests added for new TellReadJob() class. * flake8 * New sample files added * Added optional parameter to Pipeline() class. Added optional parameter to Pipeline() class that overwrites the values in the lane column of a sample-sheet's data section. This functionality used to reside in the qp-klp plugin and is a common usage pattern. This allows SPP to override the value in a sample-sheet's lane column with the value provided by the user at submission time. * bugfix * Fixes error Fixes error found when post-processing adapter-trimmed fastq files. All files were being moved into one of the project sub-folders, rather than into their associated folders. This appears to be due to recent implementation change. All files are now moved into their correct folder. * Rewrote test * Updated branch to use new DFSheet() functionality * Updated to recent changes in metapool * Update from testing * Updates to TRIntegrateJob based on testing * Updated sample config file * Replaced legacy exit check for tellread * recent updates * Updated tests * Update setup.py to point to merged metapool updates * New tests for slurm polling * Updates * Updates * flake8 * flake8 post merger * Fixed older test * Minor update * Remove lengthy comment * fix test * Updates based on feedback * Update based on feedback * Added renamed file * Refactored sequence counting job Request from Antonio to make TRNormCountsJob more generalized for current and upcoming work. TRNormCountsJob replaced w/SeqCountsJob: * takes a list of paths to fastq and/or fastq.gz files. * runs seqtk to count sequences and bases in parallel. * aggregator code produces a json file of counts from log output. 
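For context, the counting/aggregation flow described in this bullet can be sketched roughly as follows (illustration only; paths and numbers are made up, and the real implementation is SeqCountsJob._aggregate_counts() / aggregate_counts.py later in this patch): each array task logs the fastq path plus the tab-separated sequence/base counts reported by seqtk, and the aggregator folds those two-line logs into a single JSON document.

from json import dumps

def aggregate(two_line_logs):
    # two_line_logs: (path_line, counts_line) pairs, e.g.
    # ('/working/integrated/C501_R1.fastq.gz', '1234567\t186319617')
    results = {}
    for path_line, counts_line in two_line_logs:
        _dir, _file = path_line.rsplit('/', 1)
        seq_counts, base_pairs = counts_line.split('\t')
        results.setdefault(_dir, {})[_file] = {'seq_counts': int(seq_counts),
                                               'base_pairs': int(base_pairs)}
    return dumps(results, indent=2)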
* Update test based on randomness in output generation * Updates based on feedback * Common parse_log() method made default --- README.md | 8 +- sequence_processing_pipeline/Commands.py | 6 + sequence_processing_pipeline/ConvertJob.py | 9 +- sequence_processing_pipeline/FastQCJob.py | 17 - sequence_processing_pipeline/Job.py | 187 ++++++---- sequence_processing_pipeline/NuQCJob.py | 72 ++-- sequence_processing_pipeline/Pipeline.py | 200 +++++++---- sequence_processing_pipeline/SeqCountsJob.py | 147 ++++++++ .../TRIntegrateJob.py | 163 +++++++++ sequence_processing_pipeline/TellReadJob.py | 174 +++++++++ .../aggregate_counts.py | 40 +++ .../contrib/create_picklist.py | 72 ++++ .../contrib/integrate-indices-np.py | 332 ++++++++++++++++++ .../contrib/plot_counts.py | 27 ++ .../scripts/fake_squeue.py | 101 ++++++ .../templates/integrate.sbatch | 67 ++++ .../templates/seq_counts.sbatch | 25 ++ .../templates/tellread-cleanup.sbatch | 13 + .../templates/tellread.sbatch | 48 +++ .../20230906_FS10001773_68_BTR67708-1611.csv | 41 +++ .../tests/data/aggregate_counts_results.json | 36 ++ .../iseq_metagenomic.json | 19 + .../tests/data/fake_sample_index_list.txt | 96 +++++ .../tests/data/files_to_count.txt | 8 + .../tests/data/seq_counts.sbatch | 25 ++ .../seq_counts_logs/seq_count_2679966_1.err | 3 + .../seq_counts_logs/seq_count_2679966_1.out | 2 + .../seq_counts_logs/seq_count_2679966_2.err | 3 + .../seq_counts_logs/seq_count_2679966_2.out | 2 + .../seq_counts_logs/seq_count_2679966_3.err | 3 + .../seq_counts_logs/seq_count_2679966_3.out | 2 + .../seq_counts_logs/seq_count_2679966_4.err | 3 + .../seq_counts_logs/seq_count_2679966_4.out | 2 + .../seq_counts_logs/seq_count_2679966_5.err | 3 + .../seq_counts_logs/seq_count_2679966_5.out | 2 + .../seq_counts_logs/seq_count_2679966_6.err | 3 + .../seq_counts_logs/seq_count_2679966_6.out | 2 + .../seq_counts_logs/seq_count_2679966_7.err | 3 + .../seq_counts_logs/seq_count_2679966_7.out | 2 + .../seq_counts_logs/seq_count_2679966_8.err | 3 + .../seq_counts_logs/seq_count_2679966_8.out | 2 + .../data/tellseq_metag_dummy_sample_sheet.csv | 135 +++++++ .../data/tellseq_output/integrate_test.sbatch | 67 ++++ .../data/tellseq_output/tellread_test.sbatch | 47 +++ .../tests/test_ConvertJob.py | 2 +- .../tests/test_FastQCJob.py | 2 +- .../tests/test_Job.py | 173 ++++++++- .../tests/test_NuQCJob.py | 137 ++++---- .../tests/test_Pipeline.py | 160 +++++---- .../tests/test_SeqCountsJob.py | 74 ++++ .../tests/test_TRIntegrateJob.py | 72 ++++ .../tests/test_TellReadJob.py | 66 ++++ .../tests/test_commands.py | 25 +- .../tests/test_util.py | 32 +- sequence_processing_pipeline/util.py | 13 +- 55 files changed, 2610 insertions(+), 368 deletions(-) create mode 100644 sequence_processing_pipeline/SeqCountsJob.py create mode 100644 sequence_processing_pipeline/TRIntegrateJob.py create mode 100644 sequence_processing_pipeline/TellReadJob.py create mode 100644 sequence_processing_pipeline/aggregate_counts.py create mode 100644 sequence_processing_pipeline/contrib/create_picklist.py create mode 100644 sequence_processing_pipeline/contrib/integrate-indices-np.py create mode 100644 sequence_processing_pipeline/contrib/plot_counts.py create mode 100755 sequence_processing_pipeline/scripts/fake_squeue.py create mode 100644 sequence_processing_pipeline/templates/integrate.sbatch create mode 100644 sequence_processing_pipeline/templates/seq_counts.sbatch create mode 100644 sequence_processing_pipeline/templates/tellread-cleanup.sbatch create mode 100644 
sequence_processing_pipeline/templates/tellread.sbatch create mode 100644 sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv create mode 100644 sequence_processing_pipeline/tests/data/aggregate_counts_results.json create mode 100644 sequence_processing_pipeline/tests/data/fake_sample_index_list.txt create mode 100644 sequence_processing_pipeline/tests/data/files_to_count.txt create mode 100644 sequence_processing_pipeline/tests/data/seq_counts.sbatch create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out create mode 100644 sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv create mode 100644 sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch create mode 100644 sequence_processing_pipeline/tests/test_SeqCountsJob.py create mode 100644 sequence_processing_pipeline/tests/test_TRIntegrateJob.py create mode 100644 sequence_processing_pipeline/tests/test_TellReadJob.py diff --git a/README.md b/README.md index c51fcd0c..594e11aa 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ git clone https://github.com/biocore/mg-scripts.git Create a Python3 Conda environment in which to run the notebook: ```bash -conda create -n sp_pipeline 'python==3.9' numpy pandas click scipy matplotlib fastq-pair +conda create --yes -n spp python='python=3.9' scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair ``` Activate the Conda environment: @@ -62,3 +62,9 @@ Please note that the setting 'minimap2_databases' is expected to be a list of pa For NuQCJob, minimap2_databases is expected to be the path to a directory containing two subdirectories: 'metagenomic' and 'metatranscriptomic'. Each directory should contain or symlink to the appropriate .mmi files needed for that Assay type. 
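A small optional sanity check (not required by the pipeline; the two subdirectory names and the .mmi extension are the only assumptions, taken from the paragraph above) can confirm that a candidate minimap2_databases directory follows this layout:

```python
from glob import glob
from os.path import isdir, join


def check_minimap2_databases(base_dir):
    """Raise if the NuQCJob database layout described above is missing."""
    for assay_dir in ('metagenomic', 'metatranscriptomic'):
        path = join(base_dir, assay_dir)
        if not isdir(path) or not glob(join(path, '*.mmi')):
            raise ValueError(f"{path} is missing or contains no .mmi files")
```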
+ +Additional TellSeq-related notes: +'spades-cloudspades-0.1', 'tellread-release-novaseqX' or similar directories must be placed in a location available to SPP. +Their paths should be made known to SPP in the configuration files. (See examples for details). +Additional scripts found in sequence_processing_pipeline/contrib were contributed by Daniel and Omar and can be similarly located and configured. + diff --git a/sequence_processing_pipeline/Commands.py b/sequence_processing_pipeline/Commands.py index b2cd5e41..ae971fc9 100644 --- a/sequence_processing_pipeline/Commands.py +++ b/sequence_processing_pipeline/Commands.py @@ -22,8 +22,14 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb, # is now the following: # add one more level to account for project_names nested under ConvertJob # dir. + # this will ignore the _I1_ reads that appear in the integrated result. fastq_paths = glob.glob(data_location_path + '*/*/*.fastq.gz') + # case-specific filter for TellSeq output directories that also contain + # _I1_ files. Ensure paths are still sorted afterwards. + fastq_paths = [x for x in fastq_paths if '_I1_001.fastq.gz' not in x] + fastq_paths = sorted(fastq_paths) + # convert from GB and halve as we sum R1 max_size = (int(max_file_list_size_in_gb) * (2 ** 30) / 2) diff --git a/sequence_processing_pipeline/ConvertJob.py b/sequence_processing_pipeline/ConvertJob.py index 122a4987..dc9b36aa 100644 --- a/sequence_processing_pipeline/ConvertJob.py +++ b/sequence_processing_pipeline/ConvertJob.py @@ -156,7 +156,13 @@ def run(self, callback=None): exec_from=self.log_path, callback=callback) - self.copy_controls_between_projects() + # ConvertJob() is used to process Amplicon as well as Meta*Omic + # runs. Amplicon runs use a dummy sample-sheet generated by + # Pipeline(). For these types of sheets we can't copy controls + # between projects because demuxing is not performed here. + _, sheet_name = split(self.sample_sheet_path) + if sheet_name != 'dummy_sample_sheet.csv': + self.copy_controls_between_projects() except JobFailedError as e: # When a job has failed, parse the logs generated by this specific @@ -169,6 +175,7 @@ def run(self, callback=None): logging.info(f'Successful job: {job_info}') def parse_logs(self): + # overrides Job.parse_logs() w/tailored parse for specific logs. log_path = join(self.output_path, 'Logs') errors = join(log_path, 'Errors.log') diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 5e0bf4fc..8db0440b 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -6,7 +6,6 @@ from functools import partial from json import dumps import logging -import glob class FastQCJob(Job): @@ -305,19 +304,3 @@ def _generate_job_script(self): with open(sh_details_fp, 'w') as f: f.write('\n'.join(self.commands)) - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - files = sorted(glob.glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - # note 'error' is not same - # requirement as found in QCJob. - # ('error:'). This is a very - # generalized filter. 
- if 'error' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index b650543e..4121bd7f 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -1,3 +1,6 @@ +from jinja2 import BaseLoader, TemplateNotFound +from os.path import getmtime +import pathlib from itertools import zip_longest from os import makedirs, walk from os.path import basename, exists, split, join @@ -9,6 +12,26 @@ import logging from inspect import stack import re +from collections import Counter +from glob import glob + + +# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader +class KISSLoader(BaseLoader): + def __init__(self, path): + # pin the path for loader to the location sequence_processing_pipeline + # (the location of this file), along w/the relative path to the + # templates directory. + self.path = join(pathlib.Path(__file__).parent.resolve(), path) + + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) class Job: @@ -32,6 +55,7 @@ class Job: slurm_status_running) polling_interval_in_seconds = 60 + squeue_retry_in_seconds = 10 def __init__(self, root_dir, output_path, job_name, executable_paths, max_array_length, modules_to_load=None): @@ -103,7 +127,17 @@ def run(self): raise PipelineError("Base class run() method not implemented.") def parse_logs(self): - raise PipelineError("Base class parse_logs() method not implemented.") + # by default, look for anything to parse in the logs directory. + log_path = join(self.output_path, 'logs') + files = sorted(glob(join(log_path, '*'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if 'error:' in line.lower()] + + return [msg.strip() for msg in msgs] def _which(self, file_path, modules_to_load=None): """ @@ -212,6 +246,41 @@ def _system_call(self, cmd, allow_return_codes=[], callback=None): return {'stdout': stdout, 'stderr': stderr, 'return_code': return_code} + def _query_slurm(self, job_ids): + # query_slurm encapsulates the handling of squeue. + count = 0 + while True: + result = self._system_call("squeue -t all -j " + f"{','.join(job_ids)} " + "-o '%i,%T'") + + if result['return_code'] == 0: + # there was no issue w/squeue, break this loop and + # continue. + break + else: + # there was likely an intermittent issue w/squeue. Pause + # and wait before trying a few more times. If the problem + # persists then report the error and exit. + count += 1 + + if count > 3: + raise ExecFailedError(result['stderr']) + + sleep(Job.squeue_retry_in_seconds) + + lines = result['stdout'].split('\n') + lines.pop(0) # remove header + lines = [x.split(',') for x in lines if x != ''] + + jobs = {} + for job_id, state in lines: + # ensure unique_id is of type string for downstream use. + job_id = str(job_id) + jobs[job_id] = state + + return jobs + def wait_on_job_ids(self, job_ids, callback=None): ''' Wait for the given job-ids to finish running before returning. @@ -229,63 +298,27 @@ def wait_on_job_ids(self, job_ids, callback=None): # ensure all ids are strings to ensure proper working w/join(). job_ids = [str(x) for x in job_ids] - def query_slurm(job_ids): - # internal function query_slurm encapsulates the handling of - # squeue. 
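# A minimal usage sketch (not part of this diff) of the shared KISSLoader
# now defined in Job.py; 'example.sbatch' is a hypothetical template name,
# while the TellSeq jobs added later in this patch load real templates such
# as 'tellread.sbatch' and 'seq_counts.sbatch' in exactly this way.
from jinja2 import Environment
from sequence_processing_pipeline.Job import KISSLoader

jinja_env = Environment(loader=KISSLoader('templates'))
template = jinja_env.get_template('example.sbatch')
script_text = template.render({'job_name': 'example',
                               'queue_name': 'qiita',
                               'wall_time_limit': 1440})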
- count = 0 - while True: - result = self._system_call("squeue -t all -j " - f"{','.join(job_ids)} " - "-o '%F,%A,%T'") - - if result['return_code'] == 0: - # there was no issue w/squeue, break this loop and - # continue. - break - else: - # there was a likely intermittent issue w/squeue. Pause - # and wait before trying a few more times. If the problem - # persists then report the error and exit. - count += 1 - - if count > 3: - raise ExecFailedError(result['stderr']) - - sleep(60) - - lines = result['stdout'].split('\n') - lines.pop(0) # remove header - lines = [x.split(',') for x in lines if x != ''] - - jobs = {} - child_jobs = {} - for job_id, unique_id, state in lines: - jobs[unique_id] = state - - if unique_id != job_id: - child_jobs[unique_id] = job_id # job is a child job - - return jobs, child_jobs - while True: - jobs, child_jobs = query_slurm(job_ids) - - for jid in job_ids: - logging.debug("JOB %s: %s" % (jid, jobs[jid])) - if callback is not None: - callback(jid=jid, status=jobs[jid]) - - children = [x for x in child_jobs if child_jobs[x] == jid] - if len(children) == 0: - logging.debug("\tNO CHILDREN") - for cid in children: - logging.debug("\tCHILD JOB %s: %s" % (cid, jobs[cid])) - status = [jobs[x] in Job.slurm_status_not_running for x in job_ids] - - if set(status) == {True}: - # all jobs either completed successfully or terminated. + # Because query_slurm only returns state on the job-ids we specify, + # the wait process is a simple check to see whether any of the + # states are 'running' states or not. + jobs = self._query_slurm(job_ids) + + # jobs will be a dict of job-ids or array-ids for jobs that + # are array-jobs. the value of jobs[id] will be a state e.g.: + # 'RUNNING', 'FAILED', 'COMPLETED'. + states = [jobs[x] in Job.slurm_status_not_running for x in jobs] + + if set(states) == {True}: + # if all the states are either FAILED or COMPLETED + # then the set of those states no matter how many + # array-jobs there were will ultimately be the set of + # {True}. If not then that means there are still jobs + # that are running. break + logging.debug(f"sleeping {Job.polling_interval_in_seconds} " + "seconds...") sleep(Job.polling_interval_in_seconds) return jobs @@ -345,16 +378,48 @@ def submit_job(self, script_path, job_parameters=None, # to the user. results = self.wait_on_job_ids([job_id], callback=callback) - job_result = {'job_id': job_id, 'job_state': results[job_id]} + if job_id in results: + # job is a non-array job + job_result = {'job_id': job_id, 'job_state': results[job_id]} + else: + # job is an array job + # assume all array jobs in this case will be associated w/job_id. + counts = Counter() + for array_id in results: + counts[results[array_id]] += 1 + + # for array jobs we won't be returning a string representing the + # state of a single job. Instead we're returning a dictionary of + # the number of unique states the set of array-jobs ended up in and + # the number for each one. 
+ job_result = {'job_id': job_id, 'job_state': dict(counts)} if callback is not None: - callback(jid=job_id, status=job_result['job_state']) + if isinstance(job_result['job_state'], dict): + # this is an array job + states = [] + for key in counts: + states.append(f"{key}: {counts[key]}") - if job_result['job_state'] == 'COMPLETED': - return job_result + callback(jid=job_id, status=", ".join(states)) + + else: + # this is a standard job + callback(jid=job_id, status=job_result['job_state']) + + if isinstance(job_result['job_state'], dict): + states = list(job_result['job_state'].keys()) + if states == ['COMPLETED']: + return job_result + else: + raise JobFailedError(f"job {job_id} exited with jobs in the " + f"following states: {', '.join(states)}") else: - raise JobFailedError(f"job {job_id} exited with status " - f"{job_result['job_state']}") + if job_result['job_state'] == 'COMPLETED': + return job_result + else: + raise JobFailedError(f"job {job_id} exited with status " + f"{job_result['job_state']}") def _group_commands(self, cmds): # break list of commands into chunks of max_array_length (Typically diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index b2ffd3a9..0e05b41d 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -1,8 +1,7 @@ -from jinja2 import BaseLoader, TemplateNotFound from metapool import load_sample_sheet from os import stat, makedirs, rename -from os.path import join, basename, dirname, exists, abspath, getmtime -from sequence_processing_pipeline.Job import Job +from os.path import join, basename, dirname, exists, abspath, split +from sequence_processing_pipeline.Job import Job, KISSLoader from sequence_processing_pipeline.PipelineError import (PipelineError, JobFailedError) from sequence_processing_pipeline.Pipeline import Pipeline @@ -11,28 +10,9 @@ from sequence_processing_pipeline.Commands import split_similar_size_bins from sequence_processing_pipeline.util import iter_paired_files from jinja2 import Environment -import glob +from glob import glob import re from sys import executable -import pathlib - - -# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader -class KISSLoader(BaseLoader): - def __init__(self, path): - # pin the path for loader to the location sequence_processing_pipeline - # (the location of this file), along w/the relative path to the - # templates directory. 
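# Illustration only (not from this diff) of the array-job summary that
# submit_job() now derives from wait_on_job_ids(); the Slurm ids are made up.
from collections import Counter

results = {'1234567_1': 'COMPLETED',
           '1234567_2': 'COMPLETED',
           '1234567_3': 'FAILED'}      # as returned for a 3-task array job
counts = Counter(results.values())     # Counter({'COMPLETED': 2, 'FAILED': 1})
job_result = {'job_id': '1234567', 'job_state': dict(counts)}
status = ', '.join(f'{k}: {v}' for k, v in counts.items())
# submit_job() would raise JobFailedError here because the observed states
# are not exclusively 'COMPLETED'.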
- self.path = join(pathlib.Path(__file__).parent.resolve(), path) - - def get_source(self, environment, template): - path = join(self.path, template) - if not exists(path): - raise TemplateNotFound(template) - mtime = getmtime(path) - with open(path) as f: - source = f.read() - return source, path, lambda: mtime == getmtime(path) logging.basicConfig(level=logging.DEBUG) @@ -128,6 +108,9 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, self.minimum_bytes = 3100 self.fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}' r'\.fastq\.gz$') + self.interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d' + r'_\d{3}\.interleave\.fastq' + r'\.gz$') self.html_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.html$') self.json_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.json$') @@ -167,7 +150,7 @@ def _filter_empty_fastq_files(self, filtered_directory, ''' empty_list = [] - files = glob.glob(join(filtered_directory, f'*.{self.suffix}')) + files = glob(join(filtered_directory, f'*.{self.suffix}')) for r1, r2 in iter_paired_files(files): full_path = join(filtered_directory, r1) @@ -194,7 +177,7 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst): substr = regex.search(file_name) if substr is None: raise ValueError(f"{file_name} does not follow naming " - " pattern.") + "pattern.") else: # check if found substring is a member of this # project. Note sample-name != sample-id @@ -214,8 +197,7 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst): for fp in files_to_move: move(fp, dst) - @staticmethod - def _move_trimmed_files(project_name, output_path): + def _move_trimmed_files(self, project_name, output_path): ''' Given output_path, move all fastqs to a new subdir named project_name. :param project_name: The name of the new folder to be created. @@ -229,8 +211,16 @@ def _move_trimmed_files(project_name, output_path): # this directory shouldn't already exist. makedirs(join(output_path, project_name), exist_ok=False) - for trimmed_file in list(glob.glob(pattern)): - move(trimmed_file, join(output_path, project_name)) + sample_ids = [x[0] for x in self.sample_ids + if x[1] == project_name] + + for trimmed_file in list(glob(pattern)): + file_name = split(trimmed_file)[1] + substr = self.interleave_fastq_regex.search(file_name) + if substr is not None: + # only move the sample_ids in this project. + if substr[1] in sample_ids: + move(trimmed_file, join(output_path, project_name)) else: raise ValueError(f"'{output_path}' does not exist") @@ -282,10 +272,9 @@ def run(self, callback=None): for project in self.project_data: project_name = project['Sample_Project'] needs_human_filtering = project['HumanFiltering'] - source_dir = join(self.output_path, project_name) pattern = f"{source_dir}/*.fastq.gz" - completed_files = list(glob.glob(pattern)) + completed_files = list(glob(pattern)) # if the 'only-adapter-filtered' directory exists, move the files # into a unique location so that files from multiple projects @@ -294,7 +283,7 @@ def run(self, callback=None): 'only-adapter-filtered') if exists(trimmed_only_path): - NuQCJob._move_trimmed_files(project_name, trimmed_only_path) + self._move_trimmed_files(project_name, trimmed_only_path) if needs_human_filtering is True: filtered_directory = join(source_dir, 'filtered_sequences') @@ -330,7 +319,7 @@ def run(self, callback=None): # move all html files underneath the subdirectory for this project. 
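# Quick standalone check (not part of this diff) of the new interleave
# regex consulted by _move_trimmed_files(); the file name below is made up.
import re

interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}'
                                    r'\.interleave\.fastq\.gz$')

fname = 'Tissue_1_Super_Trizol_S19_L001_R1_001.interleave.fastq.gz'
substr = interleave_fastq_regex.search(fname)
assert substr is not None and substr[1] == 'Tissue_1_Super_Trizol'
# _move_trimmed_files() then moves the file only if substr[1] is one of the
# Sample_IDs registered to the project currently being processed.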
pattern = f"{old_html_path}/*.html" - completed_htmls = list(glob.glob(pattern)) + completed_htmls = list(glob(pattern)) self._move_helper(completed_htmls, # Tissue_1_Super_Trizol_S19_L001_R1_001.html self.html_regex, @@ -339,7 +328,7 @@ def run(self, callback=None): # move all json files underneath the subdirectory for this project. pattern = f"{old_json_path}/*.json" - completed_jsons = list(glob.glob(pattern)) + completed_jsons = list(glob(pattern)) self._move_helper(completed_jsons, # Tissue_1_Super_Trizol_S19_L001_R1_001.json self.json_regex, @@ -357,7 +346,7 @@ def _confirm_job_completed(self): # since NuQCJob processes across all projects in a run, there isn't # a need to iterate by project_name and job_id. pattern = f"{self.output_path}/hds-{self.qiita_job_id}.*.completed" - completed_files = list(glob.glob(pattern)) + completed_files = list(glob(pattern)) if completed_files: return True @@ -510,16 +499,3 @@ def _generate_job_script(self, max_bucket_size): pmls_path=self.pmls_path)) return job_script_path - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - # sorted lists give predictable results - files = sorted(glob.glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - if 'error:' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 21b460c2..9a30c2a0 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -173,25 +173,20 @@ def get_qiita_id_from_sif_fp(fp): hacky_name_pieces_dict = parse_project_name(temp_name) return hacky_name_pieces_dict[QIITA_ID_KEY] - def __init__(self, configuration_file_path, run_id, sample_sheet_path, - mapping_file_path, output_path, qiita_job_id, pipeline_type): + def __init__(self, configuration_file_path, run_id, input_file_path, + output_path, qiita_job_id, pipeline_type, lane_number=None): """ Initialize Pipeline object w/configuration information. :param configuration_file_path: Path to configuration.json file. :param run_id: Used w/search_paths to locate input run_directory. - :param sample_sheet_path: Path to sample-sheet. - :param mapping_file_path: Path to mapping file. + :param input_file_path: Path to sample-sheet or pre-prep file. :param output_path: Path where all pipeline-generated files live. :param qiita_job_id: Qiita Job ID creating this Pipeline. :param pipeline_type: Pipeline type ('Amplicon', 'Metagenomic', etc.) + :param lane_number: (Optional) overwrite lane_number in input_file. 
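# Hypothetical caller-side usage (not from this diff) of the reworked
# constructor documented above: a single input_file_path now covers both
# sample-sheets and pre-prep (mapping) files, and lane_number optionally
# overrides the Lane column for non-amplicon runs.  All paths, run-id and
# Qiita job id values below are placeholders.
from sequence_processing_pipeline.Pipeline import Pipeline

metag = Pipeline('configuration.json',
                 '211021_A00000_0000_SAMPLE',
                 'metagenomic_sheet.csv',       # a sample-sheet
                 'output_dir',
                 'abcdabcdabcdabcdabcdabcd',
                 'Metagenomic',
                 lane_number=2)

amplicon = Pipeline('configuration.json',
                    '211021_A00000_0000_SAMPLE',
                    'pre_prep_file.txt',        # a pre-prep (mapping) file
                    'output_dir',
                    'abcdabcdabcdabcdabcdabcd',
                    'Amplicon')                 # lane_number is ignored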
""" - if sample_sheet_path is not None and mapping_file_path is not None: - raise PipelineError("sample_sheet_path or mapping_file_path " - "must be defined, but not both.") - - if sample_sheet_path is None and mapping_file_path is None: - raise PipelineError("sample_sheet_path or mapping_file_path " - "must be defined, but not both.") + if input_file_path is None: + raise PipelineError("user_input_file_path cannot be None") if pipeline_type not in Pipeline.pipeline_types: raise PipelineError(f"'{type}' is not a valid pipeline type.") @@ -235,22 +230,36 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, self.run_id = run_id self.qiita_job_id = qiita_job_id self.pipeline = [] + self.assay_type = None - if sample_sheet_path: - self.search_paths = self.configuration['search_paths'] - self.sample_sheet = self._validate_sample_sheet(sample_sheet_path) - self.mapping_file = None - else: + # this method will catch a run directory as well as its products + # directory, which also has the same name. Hence, return the + # shortest matching path as that will at least return the right + # path between the two. + results = [] + + if pipeline_type == Pipeline.AMPLICON_PTYPE: self.search_paths = self.configuration['amplicon_search_paths'] - self.mapping_file = self._validate_mapping_file(mapping_file_path) - # unlike _validate_sample_sheet() which returns a SampleSheet - # object that stores the path to the file it was created from, - # _validate_mapping_file() just returns a DataFrame. Store the - # path to the original mapping file itself as well. - self.mapping_file_path = mapping_file_path - self.sample_sheet = None + self.assay_type = Pipeline.AMPLICON_ATYPE + else: + self.search_paths = self.configuration['search_paths'] + + for search_path in self.search_paths: + logging.debug(f'Searching {search_path} for {self.run_id}') + for entry in listdir(search_path): + some_path = join(search_path, entry) + # ensure some_path never ends in '/' + some_path = some_path.rstrip('/') + if isdir(some_path) and some_path.endswith(self.run_id): + logging.debug(f'Found {some_path}') + results.append(some_path) - self.run_dir = self._search_for_run_dir() + if results: + results.sort(key=lambda s: len(s)) + self.run_dir = results[0] + else: + raise PipelineError(f"A run-dir for '{self.run_id}' could not be " + "found") # required files for successful operation # both RTAComplete.txt and RunInfo.xml should reside in the root of @@ -268,14 +277,78 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, except PermissionError: raise PipelineError('RunInfo.xml is present, but not readable') - if self.mapping_file is not None: + self.input_file_path = input_file_path + + if pipeline_type == Pipeline.AMPLICON_PTYPE: + # assume input_file_path references a pre-prep (mapping) file. + + self.mapping_file = self._validate_mapping_file(input_file_path) + # unlike _validate_sample_sheet() which returns a SampleSheet + # object that stores the path to the file it was created from, + # _validate_mapping_file() just returns a DataFrame. Store the + # path to the original mapping file itself as well. + # create dummy sample-sheet output_fp = join(output_path, 'dummy_sample_sheet.csv') self.generate_dummy_sample_sheet(self.run_dir, output_fp) - self.sample_sheet = output_fp + self.dummy_sheet_path = output_fp + + # Optional lane_number parameter is ignored for Amplicon + # runs, as the only valid value is 1. 
+ else: + if lane_number is not None: + # confirm that the lane_number is a reasonable value. + lane_number = int(lane_number) + if lane_number < 1 or lane_number > 8: + raise ValueError(f"'{lane_number}' is not a valid name" + " number") + + # overwrite sample-sheet w/DFSheets processed version + # with overwritten Lane number. + sheet = load_sample_sheet(input_file_path) + with open(input_file_path, 'w') as f: + sheet.write(f, lane=lane_number) + + # assume user_input_file_path references a sample-sheet. + self.sample_sheet = self._validate_sample_sheet(input_file_path) + self.mapping_file = None + + if self.assay_type is None: + # set self.assay_type for non-amplicon types. + assay_type = self.sample_sheet.Header['Assay'] + if assay_type not in Pipeline.assay_types: + raise ValueError(f"'{assay_type} is not a valid Assay type") + self.assay_type = assay_type self._configure_profile() + def get_sample_sheet_path(self): + """ + Returns path to a sample-sheet or dummy sample-sheet for amplicon runs. + """ + if self.assay_type == Pipeline.AMPLICON_ATYPE: + # assume self.dummy_sheet_path has been created for amplicon runs. + return self.dummy_sheet_path + else: + # assume input_file_path is a sample-sheet for non-amplicon runs. + return self.input_file_path + + def get_software_configuration(self, software): + if software is None or software == "": + raise ValueError(f"'{software}' is not a valid value") + + key_order = ['profile', 'configuration', software] + + config = self.config_profile + + for key in key_order: + if key in config: + config = config[key] + else: + raise PipelineError(f"'{key}' is not defined in configuration") + + return config + def identify_reserved_words(self, words): ''' Returns a list of words that should not appear as column names in any @@ -294,7 +367,7 @@ def identify_reserved_words(self, words): # specifically how the proper set of prep-info file columns are # generated. For now the functionality will be defined here as this # area of metapool is currently in flux. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: reserved = PREP_MF_COLUMNS else: # results will be dependent on SheetType and SheetVersion of @@ -313,15 +386,6 @@ def _configure_profile(self): # from self.sample_sheet (or self.mapping_file). instr_type = InstrumentUtils.get_instrument_type(self.run_dir) - if isinstance(self.sample_sheet, str): - # if self.sample_sheet is a file instead of a KLSampleSheet() - # type, then this is an Amplicon run. - assay_type = Pipeline.AMPLICON_ATYPE - else: - assay_type = self.sample_sheet.Header['Assay'] - if assay_type not in Pipeline.assay_types: - raise ValueError(f"'{assay_type} is not a valid Assay type") - # open the configuration profiles directory as specified by # profiles_path in the configuration.json file. parse each json into # a nested dictionary keyed by (instrument-type, assay-type) as @@ -381,40 +445,17 @@ def _configure_profile(self): i_type = profile['profile']['instrument_type'] a_type = profile['profile']['assay_type'] - if i_type == instr_type and a_type == assay_type: + if i_type == instr_type and a_type == self.assay_type: selected_profile = profile break if selected_profile is None: - raise ValueError(f"a matching profile ({instr_type}, {assay_type}" - ") was not found. Please notify an administrator") + raise ValueError(f"a matching profile ({instr_type}, " + f"{self.assay_type}) was not found. 
Please notify" + " an administrator") self.config_profile = selected_profile - def _search_for_run_dir(self): - # this method will catch a run directory as well as its products - # directory, which also has the same name. Hence, return the - # shortest matching path as that will at least return the right - # path between the two. - results = [] - - for search_path in self.search_paths: - logging.debug(f'Searching {search_path} for {self.run_id}') - for entry in listdir(search_path): - some_path = join(search_path, entry) - # ensure some_path never ends in '/' - some_path = some_path.rstrip('/') - if isdir(some_path) and some_path.endswith(self.run_id): - logging.debug(f'Found {some_path}') - results.append(some_path) - - if results: - results.sort(key=lambda s: len(s)) - return results[0] - - raise PipelineError(f"A run-dir for '{self.run_id}' could not be " - "found") - def _directory_check(self, directory_path, create=False): if exists(directory_path): logging.debug("directory '%s' exists." % directory_path) @@ -591,7 +632,7 @@ def generate_sample_info_files(self, addl_info=None): :param addl_info: A df of (sample-name, project-name) pairs. :return: A list of paths to sample-information-files. """ - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: # Generate a list of BLANKs for each project. temp_df = self.mapping_file[[SAMPLE_NAME_KEY, _PROJECT_NAME_KEY]] temp_df_as_dicts_list = temp_df.to_dict(orient='records') @@ -670,7 +711,7 @@ def get_sample_ids(self): # test for self.mapping_file, since self.sample_sheet will be # defined in both cases. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: results = list(self.mapping_file.sample_name) else: results = [x.Sample_ID for x in self.sample_sheet.samples] @@ -685,7 +726,7 @@ def get_sample_names(self, project_name=None): ''' # test for self.mapping_file, since self.sample_sheet will be # defined in both cases. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: return self._get_sample_names_from_mapping_file(project_name) else: return self._get_sample_names_from_sample_sheet(project_name) @@ -775,12 +816,9 @@ def _parse_project_name(self, project_name, short_names): return proj_info[PROJECT_SHORT_NAME_KEY], proj_info[QIITA_ID_KEY] def get_project_info(self, short_names=False): - # test for self.mapping_file, since self.sample_sheet will be - # defined in both cases. 
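# Rough shape (assumed, for illustration only) of a configuration profile
# as consumed by _configure_profile() and get_software_configuration()
# above; the key names mirror the code, while the instrument type, the
# 'some-software' section name and its settings are placeholders.
example_profile = {
    'profile': {
        'instrument_type': 'NovaSeq 6000',
        'assay_type': 'Metagenomic',
        'configuration': {
            'some-software': {'queue': 'qiita',
                              'wallclock_time_in_minutes': 1440}
        }
    }
}

# get_software_configuration('some-software') performs this traversal and
# raises PipelineError if any key is absent:
config = example_profile
for key in ['profile', 'configuration', 'some-software']:
    config = config[key]
assert config == {'queue': 'qiita', 'wallclock_time_in_minutes': 1440}
# _configure_profile(), in turn, selects the profile whose instrument_type
# and assay_type match the current run.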
results = [] - contains_replicates = None - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: if CONTAINS_REPLICATES_KEY in self.mapping_file: contains_replicates = True else: @@ -792,25 +830,35 @@ def get_project_info(self, short_names=False): {p: parse_project_name(p) for p in sample_project_map} else: projects_info = self.sample_sheet.get_projects_details() - # endif mapping_file if short_names: proj_name_key = PROJECT_SHORT_NAME_KEY else: proj_name_key = PROJECT_FULL_NAME_KEY - # endif + for curr_project_info in projects_info.values(): curr_dict = { _PROJECT_NAME_KEY: curr_project_info[proj_name_key], QIITA_ID_KEY: curr_project_info[QIITA_ID_KEY] } - if contains_replicates is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: + # this is a mapping file: curr_contains_reps = contains_replicates else: - curr_contains_reps = \ - curr_project_info.get(CONTAINS_REPLICATES_KEY, False) - # endif + bi_df = self.sample_sheet.Bioinformatics + if CONTAINS_REPLICATES_KEY in bi_df.columns.tolist(): + # subselect rows in [Bioinformatics] based on whether they + # match the project name. + df = bi_df.loc[bi_df['Sample_Project'] == + curr_project_info[proj_name_key]] + # since only one project can match by definition, convert + # to dict and extract the needed value. + curr_contains_reps = df.iloc[0].to_dict()[ + CONTAINS_REPLICATES_KEY] + else: + curr_contains_reps = False + curr_dict[CONTAINS_REPLICATES_KEY] = curr_contains_reps results.append(curr_dict) # next project diff --git a/sequence_processing_pipeline/SeqCountsJob.py b/sequence_processing_pipeline/SeqCountsJob.py new file mode 100644 index 00000000..f080bd00 --- /dev/null +++ b/sequence_processing_pipeline/SeqCountsJob.py @@ -0,0 +1,147 @@ +from os.path import join, split +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from os import walk +from json import dumps +from glob import glob + + +logging.basicConfig(level=logging.DEBUG) + + +class SeqCountsJob(Job): + def __init__(self, run_dir, output_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, files_to_count_path, + cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: A hard-limit for array-sizes + :param files_to_count_path: A path to a list of file-paths to count. + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'SeqCountsJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + + # raise an Error if jmem is not a valid floating point value. 
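# Hypothetical caller-side set-up (not from this diff) for SeqCountsJob:
# one fastq path per line is written to files_to_count.txt, and the job
# sizes its Slurm array from the number of non-empty lines in that file.
# All paths and resource values below are placeholders.
from glob import glob
from os.path import join

run_dir = '/sequencing/211021_A00000_0000_SAMPLE'
out_dir = '/working/my_qiita_job/SeqCountsJob'
fastq_paths = sorted(glob('/working/my_qiita_job/fastq/*/*.fastq.gz'))

files_to_count = join(out_dir, 'files_to_count.txt')
with open(files_to_count, 'w') as f:
    f.write('\n'.join(fastq_paths))

# job = SeqCountsJob(run_dir, out_dir, 'qiita', 1, 1440, '8', [],
#                    'my_qiita_job', 1000, files_to_count)
# job.run()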
+ self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.jinja_env = Environment(loader=KISSLoader('templates')) + + self.job_name = (f"seq_counts_{self.qiita_job_id}") + self.files_to_count_path = files_to_count_path + + with open(self.files_to_count_path, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + lines = [x for x in lines if x != ''] + self.file_count = len(lines) + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'SeqCountsJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + self._aggregate_counts() + + logging.debug(f'SeqCountJob {self.job_info["job_id"]} completed') + + def _generate_job_script(self): + job_script_path = join(self.output_path, "seq_counts.sbatch") + template = self.jinja_env.get_template("seq_counts.sbatch") + + # got to make files_to_count.txt and put it in the output directory + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "seq_counts", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "queue_name": self.queue_name, + "file_count": self.file_count, + "output_path": self.output_path + })) + + return job_script_path + + def parse_logs(self): + # overrides Job.parse_logs() w/tailored parse for specific logs. 
+ files = sorted(glob(join(self.log_path, '*.err'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if line.startswith("[E::stk_size]")] + + return [msg.strip() for msg in msgs] + + def _aggregate_counts(self): + def extract_metadata(fp): + with open(fp, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + if len(lines) != 2: + raise ValueError("error processing %s" % fp) + _dir, _file = split(lines[0]) + seq_counts, base_pairs = lines[1].split('\t') + return _dir, _file, int(seq_counts), int(base_pairs) + + results = {} + + for root, dirs, files in walk(self.log_path): + for _file in files: + if _file.endswith('.out'): + log_output_file = join(root, _file) + _dir, _file, seq_counts, base_pairs = \ + extract_metadata(log_output_file) + + if _dir not in results: + results[_dir] = {} + + results[_dir][_file] = {'seq_counts': seq_counts, + 'base_pairs': base_pairs} + + results_path = join(self.output_path, 'aggregate_counts.json') + + with open(results_path, 'w') as f: + print(dumps(results, indent=2), file=f) + + return results_path diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py new file mode 100644 index 00000000..7b8740b4 --- /dev/null +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -0,0 +1,163 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet +from os import makedirs +from shutil import copyfile + + +logging.basicConfig(level=logging.DEBUG) + + +class TRIntegrateJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, integrate_script_path, sil_path, raw_fastq_dir, + reference_base, reference_map, cores_per_task): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param integrate_script_path: None + :param sil_path: A path to a confidential file mapping C5xx, adapters. + :param reference_base: None + :param reference_map: None + :param cores_per_task: # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TRIntegrateJob', + [], + # max_array_length and self.max_array_length are + # not used by TRIntegrateJob. 
+ -1, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + self.integrate_script_path = integrate_script_path + self.sil_path = sil_path + self.raw_fastq_dir = raw_fastq_dir + self.tmp_dir = join(self.output_path, 'tmp') + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. + self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.sample_count = len(self.sample_ids) + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.job_name = (f"integrate_{self.qiita_job_id}") + + with open(self.sil_path, 'r') as f: + # obtain the number of unique barcode_ids as determined by + # TellReadJob() in order to set up an array job of the + # proper length. + lines = f.readlines() + lines = [x.strip() for x in lines] + lines = [x for x in lines if x != ''] + self.barcode_id_count = len(lines) + + def run(self, callback=None): + job_script_path = self._generate_job_script() + + # copy sil_path to TRIntegrate working directory and rename to a + # predictable name. + copyfile(self.sil_path, + join(self.output_path, 'sample_index_list.txt')) + + # generate the tailored subset of adapter to barcode_id based on + # the proprietary lists owned by the manufacturer and supplied by + # the caller, and the barcode ids found in the sample-sheet. + self._generate_sample_index_list() + + makedirs(self.tmp_dir) + + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. 
+ return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'integrate_test.sbatch') + template = self.jinja_env.get_template("integrate.sbatch") + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "integrate", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "integrate_script_path": self.integrate_script_path, + "queue_name": self.queue_name, + "barcode_id_count": self.barcode_id_count, + "raw_fastq_dir": self.raw_fastq_dir, + "tmp_dir": self.tmp_dir, + "output_dir": self.output_path})) + + return job_script_path diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py new file mode 100644 index 00000000..3d68d4c8 --- /dev/null +++ b/sequence_processing_pipeline/TellReadJob.py @@ -0,0 +1,174 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet + + +logging.basicConfig(level=logging.DEBUG) + + +class TellReadJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, reference_base, + reference_map, sing_script_path, cores_per_task): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param reference_base: None + :param reference_map: None + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TellReadJob', + [], + 1, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. + self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.sing_script_path = sing_script_path + + sheet = load_sample_sheet(self.sample_sheet_path) + lane = sheet.samples[0].Lane + + # force self.lane_number to be int. raise an Error if it's not. 
+ tmp = int(lane) + if tmp < 1 or tmp > 8: + raise ValueError(f"'{tmp}' is not a valid lane number") + self.lane_number = tmp + + self.job_name = (f"{self.qiita_job_id}-tellread") + + def run(self, callback=None): + job_script_path = self._generate_job_script() + + # everything is in the job script so there are no additional params. + params = [] + + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TellReadJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + # TODO: We need more examples of failed jobs before we can create + # a parser for the logs. + # info = self.parse_logs() + # prepend just the message component of the Error. + # info.insert(0, str(e)) + info = str(e) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TellReadJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], + sample['Sample_Project'], + sample['barcode_id'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. + return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'tellread_test.sbatch') + template = self.jinja_env.get_template("tellread.sbatch") + + # generate a comma separated list of sample-ids from the tuples stored + # in self.sample_ids. + + # NB: Proposed sample-sheets will have traditional Sample_ID and + # Sample_Name columns as well as a new value named barcode_id. It's + # this column that will contain the 'C50n' values needed to be + # supplied to tellread. Later we will use this mapping to rename the + # files from C50n...fastq.gz to sample-name...fastq.gz. + samples = ','.join([id[2] for id in self.sample_ids]) + + # since we haven't included support for reference_map yet, whenever a + # reference is not included, the mapping against the list of sample_ids + # is ['NONE', 'NONE', ..., 'NONE']. + refs = ','.join(['NONE' for _ in self.sample_ids]) + + extra = "" + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "tellread", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "queue_name": self.queue_name, + "sing_script_path": self.sing_script_path, + "modules_to_load": ' '.join(self.modules_to_load), + "lane": f"s_{self.lane_number}", + # NB: Note that we no longer create a sub-directory under the + # working directory for TellRead to create all its output + # folders and files. 
This means it is creating folders and + # files in the same directory that has our sbatch script and + # logs directory. Currently there are no name collisions, + # however. + "output": self.output_path, + "rundir_path": self.root_dir, + "samples": samples, + "refs": refs, + "extra": extra + })) + + return job_script_path diff --git a/sequence_processing_pipeline/aggregate_counts.py b/sequence_processing_pipeline/aggregate_counts.py new file mode 100644 index 00000000..ace90212 --- /dev/null +++ b/sequence_processing_pipeline/aggregate_counts.py @@ -0,0 +1,40 @@ +from os import walk +from sys import argv +from os.path import join, split +from json import dumps + + +def extract_metadata(log_output_file_path): + with open(log_output_file_path, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + if len(lines) != 2: + raise ValueError("error processing %s" % log_output_file_path) + _dir, _file = split(lines[0]) + seq_counts, base_pairs = lines[1].split('\t') + return _dir, _file, int(seq_counts), int(base_pairs) + + +def aggregate_counts(fp): + results = {} + + for root, dirs, files in walk(fp): + for _file in files: + if _file.endswith('.out'): + log_output_file = join(root, _file) + _dir, _file, seq_counts, base_pairs = \ + extract_metadata(log_output_file) + + if _dir not in results: + results[_dir] = {} + + results[_dir][_file] = {'seq_counts': seq_counts, + 'base_pairs': base_pairs} + + return results + + +if __name__ == '__main__': + results = aggregate_counts(argv[1]) + with open(argv[2], 'w') as f: + print(dumps(results, indent=2), file=f) diff --git a/sequence_processing_pipeline/contrib/create_picklist.py b/sequence_processing_pipeline/contrib/create_picklist.py new file mode 100644 index 00000000..a1d6a1d0 --- /dev/null +++ b/sequence_processing_pipeline/contrib/create_picklist.py @@ -0,0 +1,72 @@ +import os +# from metapool.metapool import * +from sys import argv +import pandas as pd +import matplotlib.pyplot as plt +from metapool.metapool import (read_survival, make_2D_array, + calculate_iseqnorm_pooling_volumes, + format_pooling_echo_pick_list) +import seaborn as sns + +input_sheet_filename = argv[1] + +plate_df_w_reads = pd.read_csv(input_sheet_filename, sep='\t') +plate_df_w_reads['Blank'] = [True if 'blank' in s.lower() else False + for s in plate_df_w_reads['Sample_Name']] +reads_column = 'read_counts' + +well_col = 'Sample_Well' +assert reads_column in plate_df_w_reads.columns + +f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(8, 8)) +# evenness plot +rmax = int(round(plate_df_w_reads[reads_column].max(), -2)) + +foo = read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] is True, + reads_column], + label='Blanks', + rmax=rmax) + +bar = read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] is False, + reads_column], + label='Samples', + rmax=rmax) + +survival_df = pd.concat([foo, bar]) + +ax3.set_xlabel(reads_column) +ax3.set_ylabel('Samples') +survival_df.plot(color=['coral', 'steelblue'], ax=ax1) +ax1.set_xlabel(reads_column) +ax1.set_ylabel('Samples') + +# Histogram +sns.histplot(plate_df_w_reads[reads_column], ax=ax3) + +# Boxplot +sns.boxplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax=ax4) +sns.stripplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax=ax4, + size=3, color='black', alpha=0.5) + +plt.tight_layout() +plt.savefig(input_sheet_filename + '.comboplot.pdf') + +pdfn = calculate_iseqnorm_pooling_volumes(plate_df_w_reads, + dynamic_range=20, + normalization_column=reads_column) 
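# For reference (not part of the contributed script): create_picklist.py
# expects a tab-separated input providing at least the Sample_Name,
# Sample_Well and read_counts columns this script relies on; the rows here
# are invented examples.
import pandas as pd

example = pd.DataFrame({'Sample_Name': ['sample_A', 'BLANK.1A'],
                        'Sample_Well': ['A1', 'A2'],
                        'read_counts': [1523311, 4220]})
example.to_csv('plate_counts.tsv', sep='\t', index=False)
# invoked as:  python create_picklist.py plate_counts.tsv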
+plt.savefig(input_sheet_filename + '.normalizedplot.pdf')
+
+vols = make_2D_array(pdfn,
+                     data_col='iSeq normpool volume',
+                     well_col=well_col).astype(float)
+
+# Write the picklist as .csv
+picklist_fp = input_sheet_filename + '.picklist.csv'
+
+if os.path.isfile(picklist_fp):
+    print("Warning! This file exists already.")
+
+picklist = format_pooling_echo_pick_list(vols, max_vol_per_well=30000)
+
+with open(picklist_fp, 'w') as f:
+    f.write(picklist)
diff --git a/sequence_processing_pipeline/contrib/integrate-indices-np.py b/sequence_processing_pipeline/contrib/integrate-indices-np.py
new file mode 100644
index 00000000..b1be83a6
--- /dev/null
+++ b/sequence_processing_pipeline/contrib/integrate-indices-np.py
@@ -0,0 +1,332 @@
+# Why
+# 1) cloudspades requires the index reads be inline in the record header
+# 2) Ariadne requires the data are sorted by the barcodes
+#
+# Inlining is easy. Sorting is complex as the amount of data is large, and
+# the ordering is determined externally to the data being sorted. To
+# determine order, all barcodes must be read in to gather the complete
+# barcode <-> record association; if only partial data is read then
+# associations to barcodes may be missed, and we cannot perform an insertion
+# sort efficiently as we're writing to disk. Once we know an order for the
+# records, we (currently) read in the entirety of the subsequent data (R1
+# then R2), reorder, and write. Performing this in blocks to minimize memory
+# may be possible, but we have to assume access is random as a grouping
+# barcode may be with any record along the file.
+#
+# A variety of approaches were considered, including:
+#   - indexing portions in a hashtable, reading inputs multiple times, and
+#     writing in blocks. This was tested in both rust and python. The amount
+#     of memory was large, and keeping it under control would have required
+#     many passes over the data on disk or in memory.
+#   - using pandas to do the grouping, which possibly avoids the memory
+#     burden of a hashmap. It didn't.
+#   - using mmap'd files. No go: these are large and we have to walk over
+#     them a lot.
+#
+# Parsing this stuff adds a lot of overhead in Python. It will add some, if
+# not a lot, in rust as well -- our test data had 65M sequences. So the
+# current approach operates on the raw file data itself, using regexes to
+# parse individual records. We use numpy for sorting and getting record
+# orders. This is memory expensive, but so far much less so than the other
+# approaches tried, and it does not require multiple passes over files. We
+# bottleneck on write IO, so to mitigate that, we are using a parallel gzip
+# (pgzip), which still bottlenecks but gets better throughput.
+#
+# There probably are smarter ways to do this to reduce the memory burden.
+# Right now, it's O(N) where N is the number of records. We load R1 and R2
+# separately though, so we at least halve the memory use. As for doing it
+# faster, at the moment we appear to saturate time on gzip. The easiest
+# solution would be to increase the number of threads, but then again, this
+# process is expected to run in an array, and the filesystem can only take
+# so much.
+#
+# In addition to the inline tests, md5 checks were performed to verify that
+# all record IDs are present in both R1 / R2, and relative to the original
+# input. Spot checks on an arbitrary set of records were performed on R1 /
+# R2 to verify no apparent unusual modification. And spot checks were
+# performed to verify that correct barcodes are incorporated as expected in
+# the output.
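+#
+# As a minimal illustration of the ordering idea that gather_order() below
+# relies on: np.argsort over the barcodes yields the record order, and
+# np.unique(..., return_index=True) yields the per-barcode boundaries, e.g.
+#
+#     barcodes = np.array([b'TTGG', b'ATGC', b'TTGG'])
+#     order = barcodes.argsort()        # -> array([1, 0, 2])
+#     uniq, bounds = np.unique(barcodes[order], return_index=True)
+#     # uniq -> [b'ATGC', b'TTGG'], bounds -> [0, 1]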
+# +# author: Daniel McDonald (d3mcdonald@eng.ucsd.edu) +import numpy as np +import click +import re +import io +import pgzip +import gzip + + +RECORD = re.compile(rb'@\S+\n[ATGCN]+\n\+\n\S+\n') +BARCODE = re.compile(rb'@\S+\n([ATGCN]+)\n\+\n\S+\n') + + +def gather_order(i1_in_fp): + """Determine record order + + This is a fancy way of saying: get all the barcodes, and sort them. + + We return the order of the sorted records, the unique barcodes, + and the bounds for what barcode associated with what record + """ + # determine barcode length + _ = i1_in_fp.readline() + b = i1_in_fp.readline() + rec_len = len(b.strip()) + i1_in_fp.seek(0) + + # we need larger data in memory later anyway... + i1 = i1_in_fp.read() + start = 0 + end = len(i1) + + # get the number of records. we completely assume non-multiline fastq here + newlines = i1.count(b'\n') + assert newlines % 4 == 0 + barcodes = np.empty(newlines // 4, dtype='|S%d' % rec_len) + + # walk all index records + # grab each barcode + idx = 0 + while start < end: + barcode_result = BARCODE.search(i1, pos=start) + barcode = barcode_result.groups()[0] + assert len(barcode) == rec_len # get angry if the barcode is weird + + barcodes[idx] = barcode + idx += 1 + start = barcode_result.end() + + # we no longer need the raw data so let's toss it + del i1 + + # determine the record order of a lexicographic sort + # gather the unique barcodes so we can use them later, and the bounding + # points in the sorted set + record_order = barcodes.argsort() + barcodes = barcodes[record_order] + unique_barcodes, barcode_bounds = np.unique(barcodes, return_index=True) + + return record_order, unique_barcodes, barcode_bounds + + +def test_gather_order(): + i1data = [b'@foo', b'ATGC', b'+', b'!!!!', + b'@bar', b'TTGG', b'+', b'!!!!', + b'@baz', b'ATGC', b'+', b'!!!!', + b'@oof', b'TTTT', b'+', b'!!!!', + b'@rab', b'TTGG', b'+', b'!!!!', + b'@zab', b'TTTT', b'+', b'!!!!', + b'@ofo', b'TTTT', b'+', b'!!!!', b''] + + i1 = io.BytesIO(b'\n'.join(i1data)) + order, unique, bounds = gather_order(i1) + + exp_order = np.array([0, 2, 1, 4, 3, 5, 6]) + exp_unique = np.array([b'ATGC', b'TTGG', b'TTTT']) + exp_bounds = np.array([0, 2, 4]) + + assert (order == exp_order).all() + assert (unique == exp_unique).all() + assert (bounds == exp_bounds).all() + + +def troll_and_write(order, unique, bounds, in_, out_): + """Walk over the raw data, spit out barcode amended records in order + + - read all data + - get index boundaries for each record + - pull out each record in order according to the barcode data + - associate the barcode + - write + """ + + data = in_.read() + boundaries = np.empty([order.size, 2], dtype=np.uint64) + + stop = 0 + for idx in range(order.size): + rec = RECORD.search(data, pos=stop) + start, stop = rec.span() + boundaries[idx] = np.array([start, stop], dtype=np.uint64) + + current_barcode_idx = 0 + current_barcode = unique[current_barcode_idx] + current_barcode_bound_end = bounds[current_barcode_idx + 1] + + for order_idx, record_idx in enumerate(order): + if order_idx >= current_barcode_bound_end: + current_barcode_idx += 1 + + if current_barcode_idx >= bounds.size: + raise ValueError("should not happen?") + current_barcode = unique[current_barcode_idx] + + if current_barcode_idx + 1 >= bounds.size: + # run to the end + current_barcode_bound_end = order.size + else: + current_barcode_bound_end = bounds[current_barcode_idx + 1] + + start, stop = boundaries[record_idx] + record = data[start:stop] + + # in a one-off, these might pass by chance. 
It would be real weird + # for them to always pass for all records in a large file. + # n.b., b'foo'[0] is int, because yay, so we use a slice to maintain + # a human readable character to test against as most mortals haven't + # memorized the ascii table + assert record[:1] == b'@' + assert record[-1:] == b'\n' + + with_barcode = insert_barcode(record, current_barcode) + out_.write(with_barcode) + + +def test_troll_and_write(): + i1data = [b'@foo', b'ATGC', b'+', b'!!!!', + b'@bar', b'TTGG', b'+', b'!!!!', + b'@baz', b'ATGC', b'+', b'!!!!', + b'@oof', b'TTTT', b'+', b'!!!!', + b'@rab', b'TTGG', b'+', b'!!!!', + b'@zab', b'TTTT', b'+', b'!!!!', + b'@ofo', b'TTTT', b'+', b'!!!!', b''] + + i1 = io.BytesIO(b'\n'.join(i1data)) + order, unique, bounds = gather_order(i1) + + # we assume records are in the same order, as that has previously been + # observed w/ tellread and is the normal expectation + r1data = [b'@foo', b'AATGC', b'+', b'!!!!!', + b'@bar', b'ATTGG', b'+', b'!!!!!', + b'@baz', b'AATGC', b'+', b'!!!!!', + b'@oof', b'ATTTT', b'+', b'!!!!!', + b'@rab', b'ATTGG', b'+', b'!!!!!', + b'@zab', b'ATTTT', b'+', b'!!!!!', + b'@ofo', b'ATTTT', b'+', b'!!!!!', b''] + r1 = io.BytesIO(b'\n'.join(r1data)) + r1out = io.BytesIO() + troll_and_write(order, unique, bounds, r1, r1out) + r1out.seek(0) + + r1exp = [b'@foo BX:Z:ATGC-1', b'AATGC', b'+', b'!!!!!', + b'@baz BX:Z:ATGC-1', b'AATGC', b'+', b'!!!!!', + b'@bar BX:Z:TTGG-1', b'ATTGG', b'+', b'!!!!!', + b'@rab BX:Z:TTGG-1', b'ATTGG', b'+', b'!!!!!', + b'@oof BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b'@zab BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b'@ofo BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b''] + r1exp = b'\n'.join(r1exp) + assert r1exp == r1out.read() + + +def create_tag(t): + return b'BX:Z:%s-1' % t + + +def create_tag_no_suffix(t): + return b'BX:Z:%s' % t + + +def insert_barcode(record, barcode): + """Get the current ID, smash the needed tag in""" + # @foo\nATGC\n+\n!!!!\n + id_, remainder = record.split(b'\n', 1) + tag = create_tag(barcode) + return b'%s %s\n%s' % (id_, tag, remainder) + + +def readfq(fp): + if fp.mode == 'rb': + strip = bytes.strip + else: + strip = str.strip + + id_ = iter(fp) + seq = iter(fp) + dumb = iter(fp) + qual = iter(fp) + for rec in zip(id_, seq, dumb, qual): + yield list(map(strip, rec)) + + +def writefq(rec, out): + for item in rec: + out.write(item) + out.write(b'\n') + + +@click.group() +def cli(): + pass + + +@cli.command() +def tests(): + test_gather_order() + test_troll_and_write() + + +@cli.command() +@click.option('--r1-in', type=click.Path(exists=True), required=True) +@click.option('--r2-in', type=click.Path(exists=True), required=True) +@click.option('--i1-in', type=click.Path(exists=True), required=True) +@click.option('--r1-out', type=click.Path(exists=False), required=True) +@click.option('--r2-out', type=click.Path(exists=False), required=True) +@click.option('--threads', type=int, required=False, default=1) +@click.option('--no-sort', is_flag=True, default=False) +def integrate(r1_in, r2_in, i1_in, r1_out, r2_out, threads, no_sort): + r1_in_fp = open(r1_in, 'rb') + r2_in_fp = open(r2_in, 'rb') + i1_in_fp = open(i1_in, 'rb') + + if no_sort: + r1_out_fp = gzip.open(r1_out, mode='wb') + r2_out_fp = gzip.open(r2_out, mode='wb') + + r1_sniff = r1_in_fp.readline().strip() + r2_sniff = r2_in_fp.readline().strip() + r1_in_fp.seek(0) + r2_in_fp.seek(0) + + # outputs from tellread don't seem to have orientation information + # some downstream programs hate this, so let's add if needed. 
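+        # For example, an existing header b'@read1/1' keeps its suffix and
+        # only gains the tag (b'@read1/1 BX:Z:<barcode>'), while a bare
+        # b'@read1' first gains b'/1' and then the tag; the barcode comes
+        # from the matching I1 record (see the header rewrites below).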
+ if r1_sniff.endswith(b'/1'): + if not r2_sniff.endswith(b'/2'): + raise ValueError('unexpected endings: ' + f'{r1_sniff.decode("utf-8")} ' + f'{r2_sniff.decode("utf-8")}') + orient_r1 = '' + orient_r2 = '' + else: + assert b'/1' not in r1_sniff + + orient_r1 = b'/1' + orient_r2 = b'/2' + + for (r1, r2, i1) in zip(*map(readfq, [r1_in_fp, r2_in_fp, i1_in_fp])): + assert r1[0] == r2[0] + assert r1[0] == i1[0] + + tag = create_tag_no_suffix(i1[1]) + r1[0] = b"%s%s %s" % (r1[0], orient_r1, tag) + r2[0] = b"%s%s %s" % (r2[0], orient_r2, tag) + writefq(r1, r1_out_fp) + writefq(r2, r2_out_fp) + r1_out_fp.close() + r2_out_fp.close() + else: + # 200MB is what they use in their readme... + r1_out_fp = pgzip.open(r1_out, mode='wb', thread=threads, + blocksize=2*10**8) + r2_out_fp = pgzip.open(r2_out, mode='wb', thread=threads, + blocksize=2*10**8) + + order, unique, bounds = gather_order(i1_in_fp) + + for in_, out_ in zip([r1_in_fp, r2_in_fp], [r1_out_fp, r2_out_fp]): + troll_and_write(order, unique, bounds, in_, out_) + in_.close() + out_.close() + + +if __name__ == '__main__': + cli() diff --git a/sequence_processing_pipeline/contrib/plot_counts.py b/sequence_processing_pipeline/contrib/plot_counts.py new file mode 100644 index 00000000..ecab9e49 --- /dev/null +++ b/sequence_processing_pipeline/contrib/plot_counts.py @@ -0,0 +1,27 @@ +import matplotlib.pyplot as plt +import re +import sys +import os +import pandas as pd + +ex = re.compile(r'_I1_(C5\d\d).fastq.gz.corrected.err_barcode_removed.fastq') + +# remove total line from wc +data = [x.strip().split(' ') for x in open(sys.argv[1])][:-1] +plotdata = [(ex.search(i).groups()[0], int(v) / 4) for v, i in data] +sheetdata = dict(plotdata) + +ordered = sorted(plotdata, key=lambda x: x[1]) +f = plt.figure(figsize=(16, 8)) +plt.bar([i for i, _ in ordered], [v for _, v in ordered]) +plt.ylabel('I1 reads') +plt.xticks(list(range(len(ordered))), [i for i, _ in ordered], rotation=90) +plt.savefig(sys.argv[3] + '/counts.pdf') + +sheet = pd.read_csv(sys.argv[2], dtype=str) +sheet = sheet[~sheet['Lane'].isnull()] +sheet['read_counts'] = [sheetdata[i] for i in sheet['Barcode_ID']] +name = os.path.basename(sys.argv[2]).rsplit('.', 1)[0] +newname = name + '.read_counts.tsv' + +sheet.to_csv(sys.argv[3] + '/' + newname, sep='\t', index=False, header=True) diff --git a/sequence_processing_pipeline/scripts/fake_squeue.py b/sequence_processing_pipeline/scripts/fake_squeue.py new file mode 100755 index 00000000..6c8511ce --- /dev/null +++ b/sequence_processing_pipeline/scripts/fake_squeue.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +from json import load, dumps +from os.path import exists, join +from sys import argv +from random import randint, choice + + +def print_state(state): + # Note that %i will appear w/column name 'JOBID' in actual squeue output. + # this is because %i shows the array-id if it's an array job and what we + # consider the regular job-id if it's not an array job. 
+ print("JOBID,STATE") + for job_id in state: + if 'array_ids' in state[job_id]: + # this is an array job + for array_id in state[job_id]['array_ids']: + if state[job_id]['array_ids'][array_id] <= 0: + end_state = state[job_id]['endgame'][array_id] + else: + end_state = 'RUNNING' + + print(f"{array_id},{end_state}") + else: + # this is a non-array job + if state[job_id]['countdown'] <= 0: + end_state = state[job_id]['endgame'] + else: + end_state = 'RUNNING' + + print(f"{job_id},{end_state}") + + +def generate_output(job_ids): + results = {} + + for job_id in job_ids: + is_successful = choice([True, False]) + is_array_job = choice([True, False]) + + if is_array_job: + result = {'job_id': job_id} + result['array_ids'] = {} + result['endgame'] = {} + + for i in range(0, randint(5, 15)): + array_id = "%s_%d" % (job_id, i) + result['array_ids'][array_id] = randint(3, 7) + result['array_ids'][array_id] = randint(3, 7) + if is_successful: + # all array jobs must be successful + result['endgame'][array_id] = "COMPLETED" + else: + # some jobs may succeed but some may fail + result['endgame'][array_id] = choice( + ['COMPLETED', 'FAILED']) + results[job_id] = result + else: + result = {'job_id': job_id} + result['countdown'] = randint(3, 7) + result['endgame'] = choice(['COMPLETED', 'FAILED']) + results[job_id] = result + + return results + + +def save_state(state, file_path): + with open(file_path, 'w') as f: + print(dumps(state, indent=2), file=f) + + +def load_state(file_path): + with open(file_path, 'r') as f: + return load(f) + + +if __name__ == "__main__": + # "squeue -t all -j " f"{','.join(job_ids)} " "-o '%i,%T'" + job_ids = argv[4].split(',') + + state_file_path = join("sequence_processing_pipeline", "scripts", + "my_state.json") + + state = generate_output(job_ids) + + if exists(state_file_path): + state = load_state(state_file_path) + else: + state = generate_output(job_ids) + + print_state(state) + + for job_id in state: + if 'array_ids' in state[job_id]: + # this is an array job. + for array_id in state[job_id]['array_ids']: + state[job_id]['array_ids'][array_id] -= 1 + else: + # this is a standard job. + state[job_id]['countdown'] -= 1 + + save_state(state, state_file_path) diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch new file mode 100644 index 00000000..68ebce5e --- /dev/null +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -0,0 +1,67 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} +#SBATCH --array=1-{{barcode_id_count}} + +#SBATCH --output {{output_dir}}/logs/integrate_%x_%A_%a.out +#SBATCH --error {{output_dir}}/logs/integrate_%x_%A_%a.err + +set -x +set -e + +samples=($(cat {{output_dir}}/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR={{tmp_dir}} + +# get list of samples and determine which sample this array instance will work +# on. +samples=($(cat {{output_dir}}/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "Processing sample ${sample}..." + +# make temp directory +export TMPDIR={{tmp_dir}} +mkdir -p $TMPDIR + + +# TODO: All three input files must be non-zero in length. +# If possible, do this check as part of normal FSR operation. +# Previously this was done right here BEFORE integrating, rather +# than after. 
+ +# NB: non-zero file-length check removed for now. This should be performed +# by FSR after processing is done. +# TODO: Make sure raw_fastq_dir is TellReadJob/Full +r1_in={{raw_fastq_dir}}/TellReadJob_R1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +r2_in={{raw_fastq_dir}}/TellReadJob_R2_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +i1_in={{raw_fastq_dir}}/TellReadJob_I1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq + +# create output directory +mkdir -p {{output_dir}}/integrated + +# generate output file names +r1_out={{output_dir}}/integrated/${sample}.R1.fastq.gz +r2_out={{output_dir}}/integrated/${sample}.R2.fastq.gz +i1_out={{output_dir}}/integrated/${sample}.I1.fastq.gz + +# generate 'integrated' I1 fastq.gz file. We do this as part of each array so +# they're done in parallel. +gzip -c ${i1_in} > ${i1_out} + +# generate integrated R1 and R2 fastq.gz files. +conda activate qp-knight-lab-processing-2022.03 + +python {{integrate_script_path}} integrate \ +--no-sort \ +--r1-in ${r1_in} \ +--r2-in ${r2_in} \ +--i1-in ${i1_in} \ +--r1-out ${r1_out} \ +--r2-out ${r2_out} \ +--threads {{cores_per_task}} diff --git a/sequence_processing_pipeline/templates/seq_counts.sbatch b/sequence_processing_pipeline/templates/seq_counts.sbatch new file mode 100644 index 00000000..f44bd5b9 --- /dev/null +++ b/sequence_processing_pipeline/templates/seq_counts.sbatch @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} +#SBATCH --array=1-{{file_count}} + +#SBATCH --output {{output_path}}/logs/%x_%A_%a.out +#SBATCH --error {{output_path}}/logs/%x_%A_%a.err + +set -x +set -e + +mkdir -p {{output_path}}/logs + +files=($(cat {{output_path}}/files_to_count.txt)) +my_file=${files[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "${my_file}" + +conda activate qp-knight-lab-processing-2022.03 + +seqtk size ${my_file} diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch new file mode 100644 index 00000000..3c31219d --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -0,0 +1,13 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} # cleanup +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + +#SBATCH --output {{output}}/logs/cleanup_%x-%A.out +#SBATCH --error {{output}}/logs/cleanup_%x-%A.err + +# remove unused large outputs +rm -rf {{OUTPUT}}/biosample_format {{OUTPUT}}/1_demult {{OUTPUT}}/Full diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch new file mode 100644 index 00000000..f038a568 --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -0,0 +1,48 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH -p {{queue_name}} +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH --time {{wall_time_limit}} + +#SBATCH --output {{output}}/logs/tellread_%x-%A.out +#SBATCH --error {{output}}/logs/tellread_%x-%A.err + +set -x + +module load {{modules_to_load}} +{{sing_script_path}} \ + -i {{rundir_path}} \ + -o {{output}} \ + -s $(echo {{samples}} | tr -d '"') \ + -g $(echo {{refs}} | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} {{extra}} \ + -l {{lane}} + +# get the 
timestamp for the most recently changed file in directory '.' + +# hard-limit for wait time set to ~ 8 hours. +# (4 checks per hour, for 8 hours equals 32 iterations) +for i in $(seq 1 32); +do + before="$(find {{output}}/Full -type f -printf '%T@\n' | sort -n | tail -1)" + # assume TellReadJob is finished if ctime hasn't changed in 15 minutes + # for any fastq file in the directory. + sleep 900 + after="$(find {{output}}/Full -type f -printf '%T@\n' | sort -n | tail -1)" + + echo "$before $after" + + if [[ "$before" == "$after" ]]; then + echo "DONE" + exit 0 + else + echo "NOT DONE" + fi +done + +# if we've reached this point then we've exceeded our hard-limit for waiting. +# return w/an error. +exit 1 + diff --git a/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv b/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv new file mode 100644 index 00000000..f696f0c9 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv @@ -0,0 +1,41 @@ +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Barcode_96_Well_Position,Barcode_ID,Sample_Project,Well_description,Lane +Person.A.TELLSEQ.R20.microbe,Person.A.TELLSEQ.R20.microbe,TellSeq3_15196_P3,A1,A4,C525,TellSeq3_15196_P3,Person.A.TELLSEQ.R20.microbe,1 +Person.B.TELLSEQ.R24.microbe,Person.B.TELLSEQ.R24.microbe,TellSeq3_15196_P3,B1,B4,C526,TellSeq3_15196_P3,Person.B.TELLSEQ.R24.microbe,1 +Person.C.TELLSEQ.R21.microbe,Person.C.TELLSEQ.R21.microbe,TellSeq3_15196_P3,C1,C4,C527,TellSeq3_15196_P3,Person.C.TELLSEQ.R21.microbe,1 +Person.D.TELLSEQ.R26.microbe,Person.D.TELLSEQ.R26.microbe,TellSeq3_15196_P3,D1,D4,C528,TellSeq3_15196_P3,Person.D.TELLSEQ.R26.microbe,1 +Person.E.TELLSEQ.R19.microbe,Person.E.TELLSEQ.R19.microbe,TellSeq3_15196_P3,E1,E4,C529,TellSeq3_15196_P3,Person.E.TELLSEQ.R19.microbe,1 +Pet.C.TELLSEQ.R23.microbe,Pet.C.TELLSEQ.R23.microbe,TellSeq3_15196_P3,F1,F4,C530,TellSeq3_15196_P3,Pet.C.TELLSEQ.R23.microbe,1 +BLANK.TELLSEQ.3.12.H.microbe,BLANK.TELLSEQ.3.12.H.microbe,TellSeq3_15196_P3,G1,G4,C531,TellSeq3_15196_P3,BLANK.TELLSEQ.3.12.H.microbe,1 +Isolate.115.R1.microbe,Isolate.115.R1.microbe,TellSeq3_15196_P1,H1,H4,C532,TellSeq3_15196_P3,Isolate.115.R1.microbe,1 +Zymo.Mock.Community.R1.microbe,Zymo.Mock.Community.R1.microbe,TellSeq3_15196_P1,A2,A5,C533,TellSeq3_15196_P3,Zymo.Mock.Community.R1.microbe,1 +E.coli.QC.DNA.R1.microbe,E.coli.QC.DNA.R1.microbe,TellSeq3_15196_P1,B2,B5,C534,TellSeq3_15196_P3,E.coli.QC.DNA.R1.microbe,1 +Person.A.TELLSEQ.R20.purified.microbe,Person.A.TELLSEQ.R20.purified.microbe,TellSeq3_15196_P3,C2,C5,C535,TellSeq3_15196_P3,Person.A.TELLSEQ.R20.purified.microbe,1 +Person.B.TELLSEQ.R24.purified.microbe,Person.B.TELLSEQ.R24.purified.microbe,TellSeq3_15196_P3,D2,D5,C536,TellSeq3_15196_P3,Person.B.TELLSEQ.R24.purified.microbe,1 +Person.C.TELLSEQ.R21.purified.microbe,Person.C.TELLSEQ.R21.purified.microbe,TellSeq3_15196_P3,E2,E5,C537,TellSeq3_15196_P3,Person.C.TELLSEQ.R21.purified.microbe,1 +Person.D.TELLSEQ.R26.purified.microbe,Person.D.TELLSEQ.R26.purified.microbe,TellSeq3_15196_P3,F2,F5,C538,TellSeq3_15196_P3,Person.D.TELLSEQ.R26.purified.microbe,1 +Person.E.TELLSEQ.R19.purified.microbe,Person.E.TELLSEQ.R19.purified.microbe,TellSeq3_15196_P3,G2,G5,C539,TellSeq3_15196_P3,Person.E.TELLSEQ.R19.purified.microbe,1 +Pet.C.TELLSEQ.R23.purified.microbe,Pet.C.TELLSEQ.R23.purified.microbe,TellSeq3_15196_P3,H2,H5,C540,TellSeq3_15196_P3,Pet.C.TELLSEQ.R23.purified.microbe,1 
+BLANK.TELLSEQ.3.12.H.purified.microbe,BLANK.TELLSEQ.3.12.H.purified.microbe,TellSeq3_15196_P3,A3,A6,C541,TellSeq3_15196_P3,BLANK.TELLSEQ.3.12.H.purified.microbe,1 +Isolate.115.R2.microbe,Isolate.115.R2.microbe,TellSeq3_15196_P1,B3,B6,C542,TellSeq3_15196_P3,Isolate.115.R2.microbe,1 +Zymo.Mock.Community.R2.microbe,Zymo.Mock.Community.R2.microbe,TellSeq3_15196_P1,C3,C6,C543,TellSeq3_15196_P3,Zymo.Mock.Community.R2.microbe,1 +E.coli.QC.DNA.R2.microbe,E.coli.QC.DNA.R2.microbe,TellSeq3_15196_P1,D3,D6,C544,TellSeq3_15196_P3,E.coli.QC.DNA.R2.microbe,1 +Person.A.TELLSEQ.R20.std,Person.A.TELLSEQ.R20.std,TellSeq3_15196_P3,A1,A1,C501,TellSeq3_15196,Person.A.TELLSEQ.R20.std,1 +Person.B.TELLSEQ.R24.std,Person.B.TELLSEQ.R24.std,TellSeq3_15196_P3,B1,B1,C502,TellSeq3_15196,Person.B.TELLSEQ.R24.std,1 +Person.C.TELLSEQ.R21.std,Person.C.TELLSEQ.R21.std,TellSeq3_15196_P3,C1,C1,C503,TellSeq3_15196,Person.C.TELLSEQ.R21.std,1 +Person.D.TELLSEQ.R26.std,Person.D.TELLSEQ.R26.std,TellSeq3_15196_P3,D1,D1,C504,TellSeq3_15196,Person.D.TELLSEQ.R26.std,1 +Person.E.TELLSEQ.R19.std,Person.E.TELLSEQ.R19.std,TellSeq3_15196_P3,E1,E1,C505,TellSeq3_15196,Person.E.TELLSEQ.R19.std,1 +Pet.C.TELLSEQ.R23.std,Pet.C.TELLSEQ.R23.std,TellSeq3_15196_P3,F1,F1,C506,TellSeq3_15196,Pet.C.TELLSEQ.R23.std,1 +BLANK.TELLSEQ.3.12.H.std,BLANK.TELLSEQ.3.12.H.std,TellSeq3_15196_P3,G1,G1,C507,TellSeq3_15196,BLANK.TELLSEQ.3.12.H.std,1 +Isolate.115.R1.std,Isolate.115.R1.std,TellSeq3_15196_P1,H1,H1,C508,TellSeq3_15196,Isolate.115.R1.std,1 +Zymo.Mock.Community.R1.std,Zymo.Mock.Community.R1.std,TellSeq3_15196_P1,A2,A2,C509,TellSeq3_15196,Zymo.Mock.Community.R1.std,1 +E.coli.QC.DNA.R1.std,E.coli.QC.DNA.R1.std,TellSeq3_15196_P1,B2,B2,C510,TellSeq3_15196,E.coli.QC.DNA.R1.std,1 +Person.A.TELLSEQ.R20.purified.std,Person.A.TELLSEQ.R20.purified.std,TellSeq3_15196_P3,C2,C2,C511,TellSeq3_15196,Person.A.TELLSEQ.R20.purified.std,1 +Person.B.TELLSEQ.R24.purified.std,Person.B.TELLSEQ.R24.purified.std,TellSeq3_15196_P3,D2,D2,C512,TellSeq3_15196,Person.B.TELLSEQ.R24.purified.std,1 +Person.C.TELLSEQ.R21.purified.std,Person.C.TELLSEQ.R21.purified.std,TellSeq3_15196_P3,E2,E2,C513,TellSeq3_15196,Person.C.TELLSEQ.R21.purified.std,1 +Person.D.TELLSEQ.R26.purified.std,Person.D.TELLSEQ.R26.purified.std,TellSeq3_15196_P3,F2,F2,C514,TellSeq3_15196,Person.D.TELLSEQ.R26.purified.std,1 +Person.E.TELLSEQ.R19.purified.std,Person.E.TELLSEQ.R19.purified.std,TellSeq3_15196_P3,G2,G2,C515,TellSeq3_15196,Person.E.TELLSEQ.R19.purified.std,1 +Pet.C.TELLSEQ.R23.purified.std,Pet.C.TELLSEQ.R23.purified.std,TellSeq3_15196_P3,H2,H2,C516,TellSeq3_15196,Pet.C.TELLSEQ.R23.purified.std,1 +BLANK.TELLSEQ.3.12.H.purified.std,BLANK.TELLSEQ.3.12.H.purified.std,TellSeq3_15196_P3,A3,A3,C517,TellSeq3_15196,BLANK.TELLSEQ.3.12.H.purified.std,1 +Isolate.115.R2.std,Isolate.115.R2.std,TellSeq3_15196_P1,B3,B3,C518,TellSeq3_15196,Isolate.115.R2.std,1 +Zymo.Mock.Community.R2.std,Zymo.Mock.Community.R2.std,TellSeq3_15196_P1,C3,C3,C519,TellSeq3_15196,Zymo.Mock.Community.R2.std,1 +E.coli.QC.DNA.R2.std,E.coli.QC.DNA.R2.std,TellSeq3_15196_P1,D3,D3,C520,TellSeq3_15196,E.coli.QC.DNA.R2.std,1 diff --git a/sequence_processing_pipeline/tests/data/aggregate_counts_results.json b/sequence_processing_pipeline/tests/data/aggregate_counts_results.json new file mode 100644 index 00000000..1cae0f05 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/aggregate_counts_results.json @@ -0,0 +1,36 @@ +{ + "REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full": { + "TellReadJob_I1_C520.fastq.gz.erroneous.fastq": { + 
"seq_counts": 2139633, + "base_pairs": 38513394 + }, + "TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 8345327641 + }, + "TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 9293296513 + }, + "TellReadJob_I1_C519.fastq.gz.erroneous.fastq": { + "seq_counts": 1932116, + "base_pairs": 34778088 + }, + "TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 1160354916 + }, + "TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 8370238082 + }, + "TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 9317943166 + }, + "TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 1267182504 + } + } +} diff --git a/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json b/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json index 089e82f1..c82c76b0 100644 --- a/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json +++ b/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json @@ -3,6 +3,25 @@ "instrument_type": "iseq", "assay_type": "Metagenomic", "configuration": { + "tell-seq": { + "label": "my_label", + "reference_base": "", + "reference_map": "", + "sing_script_path": "/my_path/tellread-release-novaseqX/run_tellread_sing.sh", + "nodes": 1, + "lane": 1, + "sample_index_list": "/my_path/sample_index_list_1.txt", + "queue": "qiita", + "wallclock_time_in_minutes": 1440, + "modules_to_load": ["singularity_3.6.4"], + "integrate_script_path": "/my_path/integrate-indices-np.py", + "tellread_mem_limit": "16", + "tellread_cores": "4", + "normcount_cores": "1", + "integrate_cores": "1", + "normcount_mem_limit": "8", + "integrate_mem_limit": "8" + }, "bcl2fastq": { "nodes": 1, "nprocs": 16, diff --git a/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt b/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt new file mode 100644 index 00000000..1c4345ef --- /dev/null +++ b/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt @@ -0,0 +1,96 @@ +CCCCCACCAA C501 NONE PE +AACCCCCACA C502 NONE PE +CCAACACACC C503 NONE PE +AACACCCCCA C504 NONE PE +CAAAACCCCC C505 NONE PE +ACACACCACC C506 NONE PE +AACCCACACC C507 NONE PE +CAAAAAAAAA C508 NONE PE +AAACCACCCC C509 NONE PE +ACACCCCCCC C510 NONE PE +AAACACCACA C511 NONE PE +CAAAACCCCA C512 NONE PE +ACCCCAACCC C513 NONE PE +CAACCAACAC C514 NONE PE +CCCCCACCCA C515 NONE PE +CCAACCCCCA C516 NONE PE +CAAAACACCC C517 NONE PE +ACACACCAAA C518 NONE PE +CCACCAAAAA C519 NONE PE +AAACCCCCCC C520 NONE PE +AACAACCCCA C521 NONE PE +CAACAACAAC C522 NONE PE +CACCACCAAA C523 NONE PE +CACAACAAAC C524 NONE PE +AACACCCACC C525 NONE PE +CAAAACACAA C526 NONE PE +AAAAAAAAAA C527 NONE PE +CCAACCCCCA C528 NONE PE +CAACCCCAAA C529 NONE PE +ACCCAACCCA C530 NONE PE +CACACCAAAC C531 NONE PE +CAACAAAAAC C532 NONE PE +CCAAAAAAAC C533 NONE PE +ACCAACACAC C534 NONE PE +CCAAAACACC C535 NONE PE +CACCCAAACC C536 NONE PE +CAAACCAAAC C537 NONE PE +CACAAACACA C538 NONE PE +ACCCACCCCC C539 NONE PE +AACCACACAC C540 NONE PE +CACCCCCACA C541 NONE PE +CACAACCACC C542 NONE PE +AAAAACAAAA C543 NONE PE +CCACACAAAC C544 NONE PE +AAACAAACAC C545 NONE PE +ACCCCCACCC C546 NONE PE +ACACCCCAAA C547 NONE PE +CAAACCAAAC 
C548 NONE PE +AAACCACCAA C549 NONE PE +CAACCAAAAC C550 NONE PE +ACACCCCCCC C551 NONE PE +CACCCACCAC C552 NONE PE +ACCCCAACCC C553 NONE PE +AAACCCAACA C554 NONE PE +ACCACACAAA C555 NONE PE +ACCCACCACC C556 NONE PE +CCCCAACCAA C557 NONE PE +CAAAAACACC C558 NONE PE +ACCACAAAAC C559 NONE PE +ACCCCCCCAA C560 NONE PE +CCACAAAACA C561 NONE PE +CAAACCCACC C562 NONE PE +ACACCACAAA C563 NONE PE +ACCAACAAAA C564 NONE PE +CCCAACAAAA C565 NONE PE +CACCCCCCCA C566 NONE PE +AAACCCACCA C567 NONE PE +CACACCACAA C568 NONE PE +CCAAACCCCA C569 NONE PE +CACCCCACCC C570 NONE PE +AAACCCCCAA C571 NONE PE +ACACCACACC C572 NONE PE +ACAAAACACC C573 NONE PE +CACAAACCAC C574 NONE PE +ACCCCCACAA C575 NONE PE +CCCCAAACCC C576 NONE PE +AAAACACAAC C577 NONE PE +AACCCAACCA C578 NONE PE +AAACACACAC C579 NONE PE +AAACACCACC C580 NONE PE +AACCCCAACA C581 NONE PE +CCACCCAAAC C582 NONE PE +CCAAAACAAC C583 NONE PE +ACCAACAAAC C584 NONE PE +AAACACCACC C585 NONE PE +AACCACACAC C586 NONE PE +CACAACAAAA C587 NONE PE +AACCAAAAAC C588 NONE PE +ACCAAACCAA C589 NONE PE +ACAAACACAC C590 NONE PE +ACCACACCAA C591 NONE PE +AAAAACAACC C592 NONE PE +CACACAACAC C593 NONE PE +CCCCCAACCC C594 NONE PE +ACACAAAACC C595 NONE PE +CCACCACACC C596 NONE PE diff --git a/sequence_processing_pipeline/tests/data/files_to_count.txt b/sequence_processing_pipeline/tests/data/files_to_count.txt new file mode 100644 index 00000000..8d7ce4b1 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/files_to_count.txt @@ -0,0 +1,8 @@ +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.erroneous.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.erroneous.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts.sbatch b/sequence_processing_pipeline/tests/data/seq_counts.sbatch new file mode 100644 index 00000000..cc73187c --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts.sbatch @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J seq_counts +#SBATCH --time 1440 +#SBATCH --mem 8G +#SBATCH -N 1 +#SBATCH -c 1 +#SBATCH -p qiita +#SBATCH --array=1-8 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs/%x_%A_%a.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs/%x_%A_%a.err + +set -x +set -e + +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs + +files=($(cat 
sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/files_to_count.txt)) +my_file=${files[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "${my_file}" + +conda activate qp-knight-lab-processing-2022.03 + +seqtk size ${my_file} diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out new file mode 100644 index 00000000..50a46674 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 8345327641 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out new file mode 100644 index 00000000..87ad9f55 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 9293296513 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out new file mode 100644 index 00000000..a22d9f8d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.erroneous.fastq +1932116 34778088 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out new file mode 100644 index 00000000..0b35614a --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 9317943166 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out new file mode 100644 index 00000000..887522ae --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 1267182504 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out new file mode 100644 index 00000000..a4fbd555 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 8370238082 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out new file mode 100644 index 00000000..6c6a9c06 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 1160354916 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out new file mode 100644 index 00000000..9be52329 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.erroneous.fastq +2139633 38513394 diff --git a/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv b/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv new file mode 100644 index 00000000..105330fd --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv @@ -0,0 +1,135 @@ +[Header],,,,,,,, +IEMFileVersion,1,,,,,,, +SheetType,tellseq_metag,,,,,,, +SheetVersion,10,,,,,,, +Investigator Name,Knight,,,,,,, +Experiment Name,RKL0151,,,,,,, +Date,5/6/24,,,,,,, +Workflow,GenerateFASTQ,,,,,,, +Application,FASTQ Only,,,,,,, +Assay,Metagenomic,,,,,,, +Description,,,,,,,, +Chemistry,Default,,,,,,, +,,,,,,,, +[Reads],,,,,,,, +151,,,,,,,, +151,,,,,,,, +,,,,,,,, +[Settings],,,,,,,, +ReverseComplement,0,,,,,,, +,,,,,,,, +[Data],,,,,,,, +Sample_ID,Sample_Name,Sample_Plate,well_id_384,barcode_id,Sample_Project,Well_description,Lane, +LS_8_10_2013_SRE,LS.8.10.2013.SRE,LS_Donor_SS_Samples_P1,A1,C501,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.10.2013.SRE,1, +LS_12_17_2014_SRE,LS.12.17.2014.SRE,LS_Donor_SS_Samples_P1,B1,C509,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.17.2014.SRE,1, +LS_4_4_2015_SRE,LS.4.4.2015.SRE,LS_Donor_SS_Samples_P1,C1,C502,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.4.2015.SRE,1, +LS_2_23_2015_SRE,LS.2.23.2015.SRE,LS_Donor_SS_Samples_P1,D1,C510,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.23.2015.SRE,1, +LS_9_28_2014_SRE,LS.9.28.2014.SRE,LS_Donor_SS_Samples_P1,E1,C503,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.9.28.2014.SRE,1, +LS_12_14_2013_SRE,LS.12.14.2013.SRE,LS_Donor_SS_Samples_P1,F1,C511,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.14.2013.SRE,1, +LS_4_7_2013_SRE,LS.4.7.2013.SRE,LS_Donor_SS_Samples_P1,G1,C504,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.7.2013.SRE,1, +LS_7_14_2013_SRE,LS.7.14.2013.SRE,LS_Donor_SS_Samples_P1,H1,C512,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.14.2013.SRE,1, +LS_10_27_2013_SRE,LS.10.27.2013.SRE,LS_Donor_SS_Samples_P1,I1,C505,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.10.27.2013.SRE,1, +LS_1_19_2014_SRE,LS.1.19.2014.SRE,LS_Donor_SS_Samples_P1,J1,C513,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.19.2014.SRE,1, +LS_9_3_2013_SRE,LS.9.3.2013.SRE,LS_Donor_SS_Samples_P1,K1,C506,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.9.3.2013.SRE,1, +LS_2_25_2013_SRE,LS.2.25.2013.SRE,LS_Donor_SS_Samples_P1,L1,C514,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.25.2013.SRE,1, +LS_7_26_2015_SRE,LS.7.26.2015.SRE,LS_Donor_SS_Samples_P1,M1,C507,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.26.2015.SRE,1, +LS_2_17_2014_SRE,LS.2.17.2014.SRE,LS_Donor_SS_Samples_P1,N1,C515,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.17.2014.SRE,1, +LS_6_29_2015_SRE,LS.6.29.2015.SRE,LS_Donor_SS_Samples_P1,O1,C508,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.29.2015.SRE,1, 
+LS_3_24_2015_SRE,LS.3.24.2015.SRE,LS_Donor_SS_Samples_P1,P1,C516,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.24.2015.SRE,1, +LS_1_6_2015_SRE,LS.1.6.2015.SRE,LS_Donor_SS_Samples_P1,A2,C517,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.6.2015.SRE,1, +T_LS_7_15_15B_SRE,T.LS.7.15.15B.SRE,LS_Donor_SS_Samples_P1,B2,C525,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.15.15B.SRE,1, +LS_6_9_2013_SRE,LS.6.9.2013.SRE,LS_Donor_SS_Samples_P1,C2,C518,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.9.2013.SRE,1, +Person A_SRE,Person A.SRE,LS_Donor_SS_Samples_P1,D2,C526,Tellseq_Shortread_Metagenomic_Analysis_10283,Person A.SRE,1, +LS_8_22_2014_R2_SRE,LS.8.22.2014.R2.SRE,LS_Donor_SS_Samples_P1,E2,C519,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.22.2014.R2.SRE,1, +Person B_SRE,Person B.SRE,LS_Donor_SS_Samples_P1,F2,C527,Tellseq_Shortread_Metagenomic_Analysis_10283,Person B.SRE,1, +LS_8_22_2014_R1_SRE,LS.8.22.2014.R1.SRE,LS_Donor_SS_Samples_P1,G2,C520,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.22.2014.R1.SRE,1, +Person C_SRE,Person C.SRE,LS_Donor_SS_Samples_P1,H2,C528,Tellseq_Shortread_Metagenomic_Analysis_10283,Person C.SRE,1, +LS_12_28_2011_SRE,LS.12.28.2011.SRE,LS_Donor_SS_Samples_P1,I2,C521,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.28.2011.SRE,1, +Person D_SRE,Person D.SRE,LS_Donor_SS_Samples_P1,J2,C529,Tellseq_Shortread_Metagenomic_Analysis_10283,Person D.SRE,1, +LS_5_4_2014_SRE,LS.5.4.2014.SRE,LS_Donor_SS_Samples_P1,K2,C522,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.5.4.2014.SRE,1, +45208_1_1,45208.1.1,UROBIOME_TEST_MF_SAMPLES_P2,L2,C530,Tellseq_Shortread_Metagenomic_Analysis_10283,45208.1.1,1, +LS_11_6_2012_SRE,LS.11.6.2012.SRE,LS_Donor_SS_Samples_P1,M2,C523,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.6.2012.SRE,1, +45248_2_2,45248.2.2,UROBIOME_TEST_MF_SAMPLES_P2,N2,C531,Tellseq_Shortread_Metagenomic_Analysis_10283,45248.2.2,1, +LS_4_3_2012_SRE,LS.4.3.2012.SRE,LS_Donor_SS_Samples_P1,O2,C524,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.3.2012.SRE,1, +45261_2_1,45261.2.1,UROBIOME_TEST_MF_SAMPLES_P2,P2,C532,Tellseq_Shortread_Metagenomic_Analysis_10283,45261.2.1,1, +45272_11_2,45272.11.2,UROBIOME_TEST_MF_SAMPLES_P2,A3,C533,Tellseq_Shortread_Metagenomic_Analysis_10283,45272.11.2,1, +T_LS_7_12_15A,T.LS.7.12.15A,Larry_Smarr_Plus_Donor_Samples_P3,B3,C541,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.12.15A,1, +45316_8_1,45316.8.1,UROBIOME_TEST_MF_SAMPLES_P2,C3,C534,Tellseq_Shortread_Metagenomic_Analysis_10283,45316.8.1,1, +T_LS_7_8_15A,T.LS.7.8.15A,Larry_Smarr_Plus_Donor_Samples_P3,D3,C542,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.8.15A,1, +45327_7_2,45327.7.2,UROBIOME_TEST_MF_SAMPLES_P2,E3,C535,Tellseq_Shortread_Metagenomic_Analysis_10283,45327.7.2,1, +LS_8_10_2013,LS.8.10.2013,LS_Time_Series_ABSQ_P4,F3,C543,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.10.2013,1, +45272_1_swab_2,45272.1.swab.2,UROBIOME_TEST_MF_SAMPLES_P2,G3,C536,Tellseq_Shortread_Metagenomic_Analysis_10283,45272.1.swab.2,1, +LS_6_29_2015,LS.6.29.2015,LS_Time_Series_ABSQ_P4,H3,C544,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.29.2015,1, +45326_1_swab_2,45326.1.swab.2,UROBIOME_TEST_MF_SAMPLES_P2,I3,C537,Tellseq_Shortread_Metagenomic_Analysis_10283,45326.1.swab.2,1, +LS_3_8_2015,LS.3.8.2015,LS_Time_Series_ABSQ_P4,J3,C545,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.8.2015,1, +T_LS_7_19_15A,T.LS.7.19.15A,Larry_Smarr_Plus_Donor_Samples_P3,K3,C538,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.19.15A,1, 
+LS_4_29_2013,LS.4.29.2013,LS_Time_Series_ABSQ_P4,L3,C546,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.29.2013,1, +T_LS_7_15_15B,T.LS.7.15.15B,Larry_Smarr_Plus_Donor_Samples_P3,M3,C539,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.15.15B,1, +LS_11_16_2014,LS.11.16.2014,LS_Time_Series_ABSQ_P4,N3,C547,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.16.2014,1, +T_LS_7_19_15B,T.LS.7.19.15B,Larry_Smarr_Plus_Donor_Samples_P3,O3,C540,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.19.15B,1, +LS_1_19_2014,LS.1.19.2014,LS_Time_Series_ABSQ_P4,P3,C548,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.19.2014,1, +LS_3_24_2015,LS.3.24.2015,LS_Time_Series_ABSQ_P4,A4,C549,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.24.2015,1, +LS_2_8_2013,LS.2.8.2013,LS_Time_Series_ABSQ_P4,B4,C557,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.8.2013,1, +LS_11_10_2013,LS.11.10.2013,LS_Time_Series_ABSQ_P4,C4,C550,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.10.2013,1, +Marine_Sediment_0_2cm_R1,Marine.Sediment.0.2cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,D4,C558,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.0.2cm.R1,1, +LS_3_23_2014,LS.3.23.2014,LS_Time_Series_ABSQ_P4,E4,C551,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.23.2014,1, +Marine_Sediment_5_7cm_R1,Marine.Sediment.5.7cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,F4,C559,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.5.7cm.R1,1, +LS_1_14_2015,LS.1.14.2015,LS_Time_Series_ABSQ_P4,G4,C552,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.14.2015,1, +Marine_Sediment_10_12cm_R2,Marine.Sediment.10.12cm.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,H4,C560,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.10.12cm.R2,1, +LS_8_25_2014,LS.8.25.2014,LS_Time_Series_ABSQ_P4,I4,C553,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.25.2014,1, +Marine_Sediment_15_17cm_R1,Marine.Sediment.15.17cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,J4,C561,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.15.17cm.R1,1, +LS_1_26_2013,LS.1.26.2013,LS_Time_Series_ABSQ_P4,K4,C554,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.26.2013,1, +Marine_Sediment_20_22cm_R1,Marine.Sediment.20.22cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,L4,C562,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.20.22cm.R1,1, +LS_6_16_2014,LS.6.16.2014,LS_Time_Series_ABSQ_P4,M4,C555,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.16.2014,1, +Marine_Sediment_25_27cm_R2,Marine.Sediment.25.27cm.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,N4,C563,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.25.27cm.R2,1, +LS_7_27_2014,LS.7.27.2014,LS_Time_Series_ABSQ_P4,O4,C556,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.27.2014,1, +Marine_Sediment_30_32cm_R3,Marine.Sediment.30.32cm.R3,MarineSediment_Donor_LarrySmarr_NoProK_P5,P4,C564,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.30.32cm.R3,1, +Person_A_R3,Person.A.R3,MarineSediment_Donor_LarrySmarr_NoProK_P5,A5,C565,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.A.R3,1, +Soil_SynCom_T4_2_Tube5,Soil.SynCom.T4.2.Tube5,16_member_community_native_soil_P6,B5,C573,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T4.2.Tube5,1, +Person_B_R2,Person.B.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,C5,C566,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.B.R2,1, +A21,A21,Tumor_Community_P7,D5,C574,Tellseq_Shortread_Metagenomic_Analysis_10283,A21,1, 
+Person_C_R4,Person.C.R4,MarineSediment_Donor_LarrySmarr_NoProK_P5,E5,C567,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.C.R4,1, +A23,A23,Tumor_Community_P7,F5,C575,Tellseq_Shortread_Metagenomic_Analysis_10283,A23,1, +Person_D_R2,Person.D.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,G5,C568,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.D.R2,1, +A27,A27,Tumor_Community_P7,H5,C576,Tellseq_Shortread_Metagenomic_Analysis_10283,A27,1, +Soil_SynCom_T1_2_Tube1,Soil.SynCom.T1.2.Tube1,16_member_community_native_soil_P6,I5,C569,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T1.2.Tube1,1, +A30,A30,Tumor_Community_P7,J5,C577,Tellseq_Shortread_Metagenomic_Analysis_10283,A30,1, +Soil _SynCom_T2_2_Tube2,Soil .SynCom.T2.2.Tube2,16_member_community_native_soil_P6,K5,C570,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil .SynCom.T2.2.Tube2,1, +A31,A31,Tumor_Community_P7,L5,C578,Tellseq_Shortread_Metagenomic_Analysis_10283,A31,1, +Soil_SynCom_T3_2_Tube3,Soil.SynCom.T3.2.Tube3,16_member_community_native_soil_P6,M5,C571,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T3.2.Tube3,1, +S1_T1_A,S1.T1.A,Tumor_Community_P7,N5,C579,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.A,1, +Soil_SynCom_T4_1_Tube4,Soil.SynCom.T4.1.Tube4,16_member_community_native_soil_P6,O5,C572,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T4.1.Tube4,1, +S2_T1_B_A,S2.T1.B.A,Tumor_Community_P7,P5,C580,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.B.A,1, +S2_T1_01BH1_Y_A,S2.T1.01BH1.Y.A,Tumor_Community_P7,A6,C581,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.01BH1.Y.A,1, +S1_T1_1CIM_A,S1.T1.1CIM.A,Tumor_Community_P7,B6,C589,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.1CIM.A,1, +S2_MT1_1HBI_Y_A,S2.MT1.1HBI.Y.A,Tumor_Community_P7,C6,C582,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.MT1.1HBI.Y.A,1, +S1_M1_B_1CIM_A,S1.M1.B.1CIM.A,Tumor_Community_P7,D6,C590,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.M1.B.1CIM.A,1, +S1_T1_B_LBM_A,S1.T1.B.LBM.A,Tumor_Community_P7,E6,C583,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.B.LBM.A,1, +BLANK_K15_cancer_patient,BLANK.K15.cancer.patient,Tumor_Community_P7,F6,C591,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.K15.cancer.patient,1, +S2_MT1_LBM_A,S2.MT1.LBM.A,Tumor_Community_P7,G6,C584,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.MT1.LBM.A,1, +BLANK_M15_cancer_patient,BLANK.M15.cancer.patient,Tumor_Community_P7,H6,C592,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.M15.cancer.patient,1, +S2_T1_A,S2.T1.A,Tumor_Community_P7,I6,C585,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.A,1, +BLANK_O15_cancer_patient,BLANK.O15.cancer.patient,Tumor_Community_P7,J6,C593,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.O15.cancer.patient,1, +1CIM_M_CNTL_A,1CIM.M.CNTL.A,Tumor_Community_P7,K6,C586,Tellseq_Shortread_Metagenomic_Analysis_10283,1CIM.M.CNTL.A,1, +BLANK_A17_cancer_patient,BLANK.A17.cancer.patient,Tumor_Community_P7,L6,C594,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.A17.cancer.patient,1, +1CIM_G_CNTL_A,1CIM.G.CNTL.A,Tumor_Community_P7,M6,C587,Tellseq_Shortread_Metagenomic_Analysis_10283,1CIM.G.CNTL.A,1, +BLANK_C17_cancer_patient,BLANK.C17.cancer.patient,Tumor_Community_P7,N6,C595,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.C17.cancer.patient,1, +GC_1HCOM_A,GC.1HCOM.A,Tumor_Community_P7,O6,C588,Tellseq_Shortread_Metagenomic_Analysis_10283,GC.1HCOM.A,1, +BLANK_E17_cancer_patient,BLANK.E17.cancer.patient,Tumor_Community_P7,P6,C596,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.E17.cancer.patient,1, +,,,,,,,, 
+[Bioinformatics],,,,,,,, +Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates +Tellseq_Shortread_Metagenomic_Analysis_10283,10283,TRUE,GATCGGAAGAGCACACGTCTGAACTCCAGTCAC,GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT,TRUE,tellseq,tellseq metagenomics,FALSE +,,,,,,,, +[Contact],,,,,,,, +Sample_Project,Email,,,,,,, +Tellseq_Shortread_Metagenomic_Analysis_10283,cbrenchy@gmail.com,,,,,,, +,,,,,,,, +[SampleContext],,,,,,,, +sample_name,sample_type,primary_qiita_study,secondary_qiita_studies,,,,, +BLANK.K15.cancer.patient,control blank,10283,,,,,, +BLANK.M15.cancer.patient,control blank,10283,,,,,, +BLANK.O15.cancer.patient,control blank,10283,,,,,, +BLANK.A17.cancer.patient,control blank,10283,,,,,, +BLANK.C17.cancer.patient,control blank,10283,,,,,, +BLANK.E17.cancer.patient,control blank,10283,,,,,, \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch new file mode 100644 index 00000000..f7a53198 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch @@ -0,0 +1,67 @@ +#!/bin/bash -l +#SBATCH -J integrate +#SBATCH --time 96:00:00 +#SBATCH --mem 16G +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH -p qiita +#SBATCH --array=1-96 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/logs/integrate_%x_%A_%a.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/logs/integrate_%x_%A_%a.err + +set -x +set -e + +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/tmp + +# get list of samples and determine which sample this array instance will work +# on. +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "Processing sample ${sample}..." + +# make temp directory +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/tmp +mkdir -p $TMPDIR + + +# TODO: All three input files must be non-zero in length. +# If possible, do this check as part of normal FSR operation. +# Previously this was done right here BEFORE integrating, rather +# than after. + +# NB: non-zero file-length check removed for now. This should be performed +# by FSR after processing is done. 
+# TODO: Make sure raw_fastq_dir is TellReadJob/Full +r1_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_R1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +r2_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_R2_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +i1_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_I1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq + +# create output directory +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated + +# generate output file names +r1_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R1.fastq.gz +r2_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R2.fastq.gz +i1_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.I1.fastq.gz + +# generate 'integrated' I1 fastq.gz file. We do this as part of each array so +# they're done in parallel. +gzip -c ${i1_in} > ${i1_out} + +# generate integrated R1 and R2 fastq.gz files. +conda activate qp-knight-lab-processing-2022.03 + +python sequence_processing_pipeline/contrib/integrate-indices-np.py integrate \ +--no-sort \ +--r1-in ${r1_in} \ +--r2-in ${r2_in} \ +--i1-in ${i1_in} \ +--r1-out ${r1_out} \ +--r2-out ${r2_out} \ +--threads 4 \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch new file mode 100644 index 00000000..9dc3ccff --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch @@ -0,0 +1,47 @@ +#!/bin/bash -l +#SBATCH -J tellread +#SBATCH -p qiita +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH --mem 16G +#SBATCH --time 96:00:00 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/logs/tellread_%x-%A.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/logs/tellread_%x-%A.err + +set -x + +module load singularity_3.6.4 +$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ + -i sequence_processing_pipeline/tests/data/sample_run_directories/150629_SN1001_0511_AH5L7GBCXX \ + -o sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob \ + -s $(echo C501,C509,C502,C510,C503,C511,C504,C512,C505,C513,C506,C514,C507,C515,C508,C516,C517,C525,C518,C526,C519,C527,C520,C528,C521,C529,C522,C530,C523,C531,C524,C532,C533,C541,C534,C542,C535,C543,C536,C544,C537,C545,C538,C546,C539,C547,C540,C548,C549,C557,C550,C558,C551,C559,C552,C560,C553,C561,C554,C562,C555,C563,C556,C564,C565,C573,C566,C574,C567,C575,C568,C576,C569,C577,C570,C578,C571,C579,C572,C580,C581,C589,C582,C590,C583,C591,C584,C592,C585,C593,C586,C594,C587,C595,C588,C596 | tr -d '"') \ + -g $(echo 
NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} \ + -l s_1 + +# get the timestamp for the most recently changed file in directory '.' + +# hard-limit for wait time set to ~ 8 hours. +# (4 checks per hour, for 8 hours equals 32 iterations) +for i in $(seq 1 32); +do + before="$(find sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full -type f -printf '%T@\n' | sort -n | tail -1)" + # assume TellReadJob is finished if ctime hasn't changed in 15 minutes + # for any fastq file in the directory. + sleep 900 + after="$(find sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full -type f -printf '%T@\n' | sort -n | tail -1)" + + echo "$before $after" + + if [[ "$before" == "$after" ]]; then + echo "DONE" + exit 0 + else + echo "NOT DONE" + fi +done + +# if we've reached this point then we've exceeded our hard-limit for waiting. +# return w/an error. +exit 1 diff --git a/sequence_processing_pipeline/tests/test_ConvertJob.py b/sequence_processing_pipeline/tests/test_ConvertJob.py index e8879864..f394cfb8 100644 --- a/sequence_processing_pipeline/tests/test_ConvertJob.py +++ b/sequence_processing_pipeline/tests/test_ConvertJob.py @@ -955,7 +955,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. self.assertTrue(job._toggle_force_job_fail()) error_msg = ("This job died.\n2024-01-01T12:12:12Z thread 99999 ERROR:" diff --git a/sequence_processing_pipeline/tests/test_FastQCJob.py b/sequence_processing_pipeline/tests/test_FastQCJob.py index 28fe52cb..a2291296 100644 --- a/sequence_processing_pipeline/tests/test_FastQCJob.py +++ b/sequence_processing_pipeline/tests/test_FastQCJob.py @@ -1121,7 +1121,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. 
self.assertTrue(job._toggle_force_job_fail()) try: diff --git a/sequence_processing_pipeline/tests/test_Job.py b/sequence_processing_pipeline/tests/test_Job.py index 7aa5889a..192709b6 100644 --- a/sequence_processing_pipeline/tests/test_Job.py +++ b/sequence_processing_pipeline/tests/test_Job.py @@ -1,10 +1,10 @@ import unittest from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import PipelineError -from os.path import abspath, join, dirname -from os import makedirs +from os.path import abspath, join, dirname, split, isdir +from os import makedirs, chmod, remove from functools import partial -from shutil import rmtree +from shutil import rmtree, copyfile import re @@ -14,7 +14,10 @@ def setUp(self): def tearDown(self): for some_path in self.remove_these: - rmtree(some_path) + if isdir(some_path): + rmtree(some_path) + else: + remove(some_path) def test_system_call(self): package_root = abspath('./sequence_processing_pipeline') @@ -123,6 +126,168 @@ def test_extract_project_names_from_fastq_dir(self): obs = job.extract_project_names_from_fastq_dir(tmp) self.assertEqual(obs, ['NPH_15288']) + def test_query_slurm(self): + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + # set up a fake job so that we can test the query_jobs() method. + # it doesn't matter what the parameters are so long as the job + # passes initialization. + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + # locate python binary path + # we have a python script called fake_squeue.py that can simulate + # repeated calls to squeue. It does this by generating a fake random + # set of array job ids for each job id passed to it and records their + # state in my_state.json. Each array job is set to change state from + # RUNNING to either COMPLETED or FAILED between three to seven squeue + # calls. The choice of which job-ids will succeed or fail, as is which + # individual array-ids will succeed or fail is random. + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + + # place the fake squeue file in a place that's known to be in the + # PATH. Make sure this file is removed after this test is complete. + # Also make sure the saved state file is removed. + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1234567', '1234568', '1234569', '1234570'] + jobs = job._query_slurm(job_ids) + + # jobs is a dictionary of unique array_ids and/or job-ids for non- + # array jobs. The faked squeue reports anywhere between five and + # fifteen array jobs for a given job-id. After the first invocation + # all processes should be in the 'RUNNING' state. + # e.g.: "1234567_1": "RUNNING" + + for j in jobs: + self.assertEqual(jobs[j], 'RUNNING') + if '_' in j: + jid, aid = j.split('_') + else: + jid = j + aid = None + + # assert the job id component of the array-id is a valid job id. + self.assertIn(jid, job_ids) + + if aid: + # assert the array-id component of the array-id is between 0 + # and 15 as defined in the fake squeue script. 
+ aid = int(aid) + self.assertLess(aid, 15) + self.assertGreaterEqual(aid, 0) + + def test_query_slurm_single_job(self): + # perform test_query_slurm() but with a single job only. + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + # set up a fake job so that we can test the query_jobs() method. + # it doesn't matter what the parameters are so long as the job + # passes initialization. + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + # locate python binary path + # we have a python script called fake_squeue.py that can simulate + # repeated calls to squeue. It does this by generating a fake random + # set of array job ids for each job id passed to it and records their + # state in my_state.json. Each array job is set to change state from + # RUNNING to either COMPLETED or FAILED between three to seven squeue + # calls. The choice of which job-ids will succeed or fail, as is which + # individual array-ids will succeed or fail is random. + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + + # place the fake squeue file in a place that's known to be in the + # PATH. Make sure this file is removed after this test is complete. + # Also make sure the saved state file is removed. + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1234567'] + jobs = job._query_slurm(job_ids) + + # jobs is a dictionary of unique array_ids and/or job-ids for non- + # array jobs. The faked squeue reports anywhere between five and + # fifteen array jobs for a given job-id. After the first invocation + # all processes should be in the 'RUNNING' state. + # e.g.: "1234567_1": "RUNNING" + + for j in jobs: + self.assertEqual(jobs[j], 'RUNNING') + if '_' in j: + jid, aid = j.split('_') + else: + jid = j + aid = None + + # assert the job id component of the array-id is a valid job id. + self.assertIn(jid, job_ids) + + if aid: + # assert the array-id component of the array-id is between 0 + # and 15 as defined in the fake squeue script. + aid = int(aid) + self.assertLess(aid, 15) + self.assertGreaterEqual(aid, 0) + + def test_wait_on_job_ids(self): + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1', '2', '3', '4'] + + # to shorten the test time, set polling_interval_in_seconds to be + # lower than one minute. + Job.polling_interval_in_seconds = 10 + results = job.wait_on_job_ids(job_ids) + + # calling query_slurm one more time after wait_on_job_ids() is called + # will technically advance the counter one more, which means that this + # doesn't confirm that wait_on_job_ids() doesn't return before EVERY + # single job is either COMPLETED or FAILED. 
However it does confirm + # that wait_on_job_ids() doesn't return once the FIRST completed array + # job is either COMPLETED or FAILED while others are still RUNNING. + # This was previously an issue. + obs = job._query_slurm(job_ids) + + for array_id in obs: + state = obs[array_id] + # w/out relying on states defined in Job, simply confirm all are + # either COMPLETED or FAILED. + self.assertIn(state, ['COMPLETED', 'FAILED']) + + # since wait_on_job_ids() now returns the same data structure as + # query_slurm(), they should be equal. + self.assertDictEqual(obs, results) + if __name__ == '__main__': unittest.main() diff --git a/sequence_processing_pipeline/tests/test_NuQCJob.py b/sequence_processing_pipeline/tests/test_NuQCJob.py index a3cfdada..8737552b 100644 --- a/sequence_processing_pipeline/tests/test_NuQCJob.py +++ b/sequence_processing_pipeline/tests/test_NuQCJob.py @@ -9,7 +9,7 @@ ) from os import makedirs, remove from metapool import load_sample_sheet -import glob +from os import walk class TestNuQCJob(unittest.TestCase): @@ -996,7 +996,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. self.assertTrue(job._toggle_force_job_fail()) try: @@ -2208,10 +2208,56 @@ def test_generate_mmi_filter_cmds_w_annotate_fastq(self): self.assertEqual(obs, exp) def test_move_trimmed(self): - # Note: this test does not make use of the output_dir that other - # tests use. + # create a NuQCJob() object, but do not call run(). + # instead we will manually create some files to test with. + double_db_paths = ["db_path/mmi_1.db", "db_path/mmi_2.db"] + job = NuQCJob( + self.fastq_root_path, + self.output_path, + self.good_sample_sheet_path, + double_db_paths, + "queue_name", + 1, + 1440, + "8", + "fastp", + "minimap2", + "samtools", + [], + self.qiita_job_id, + 1000, + "", + self.movi_path, + self.gres_value, + self.pmls_path, + ['BX'] + ) - for dummy_fp in SAMPLE_DIR: + sample_dir = [ + "NuQCJob/only-adapter-filtered/EP890158A02_S58_L001_R1_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP890158A02_S58_L001_R2_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP023801B04_S27_L001_R1_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP023801B04_S27_L001_R2_001." + "interleave.fastq.gz", + "NuQCJob/NPH_15288/fastp_reports_dir/html/EP890158A02_S58_L001_" + "R1_001.html", + "NuQCJob/NPH_15288/fastp_reports_dir/json/EP023801B04_S27_L001_" + "R1_001.json", + "NuQCJob/process_all_fastq_files.sh", + "NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981." + "completed", + "NuQCJob/logs/slurm-1897981_1.out", + "NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1", + 'NuQCJob/only-adapter-filtered/CDPH-SAL_' + 'Salmonella_Typhi_MDL-150__S36_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/CDPH-SAL_' + 'Salmonella_Typhi_MDL-150__S36_L001_R2_001.interleave.fastq.gz', + ] + + for dummy_fp in sample_dir: dummy_fp = self.path(dummy_fp) dummy_path = dirname(dummy_fp) makedirs(dummy_path, exist_ok=True) @@ -2220,38 +2266,33 @@ def test_move_trimmed(self): trimmed_only_path = self.path("NuQCJob", "only-adapter-filtered") - NuQCJob._move_trimmed_files("NPH_15288", trimmed_only_path) - - new_path = join(trimmed_only_path, "NPH_15288") - pattern = f"{new_path}/*.fastq.gz" - - exp = [ - ( - "only-adapter-filtered/NPH_15288/359180345_S58_L001_R1_001." 
- "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180337_S27_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180338_S51_L001_R2_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180338_S51_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180337_S27_L001_R2_001." - "fastq.gz" - ), - ] - - for trimmed_file in list(glob.glob(pattern)): - trimmed_file = trimmed_file.split("NuQCJob/")[-1] - if trimmed_file not in exp: - self.assertIn(trimmed_file, exp) + # test _move_trimmed_files() by verifying that only the interleave + # fastq files from the NYU project are moved. + job._move_trimmed_files("NYU_BMS_Melanoma_13059", trimmed_only_path) + + new_path = join(trimmed_only_path, "NYU_BMS_Melanoma_13059") + + exp = { + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP890158A02' + '_S58_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP023801B04' + '_S27_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP890158A02' + '_S58_L001_R2_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP023801B04' + '_S27_L001_R2_001.interleave.fastq.gz' + } + + obs = [] + for root, dirs, files in walk(new_path): + for some_file in files: + some_path = join(root, some_file) + some_path = some_path.replace(self.path(""), "") + obs.append(some_path) + + # confirm that only the samples in NYU_BMS_Melanoma_13059 were + # moved. + self.assertEqual(set(obs), exp) def _helper(self, regex, good_names, bad_names): for good_name in good_names: @@ -2263,27 +2304,5 @@ def _helper(self, regex, good_names, bad_names): self.assertIsNone(substr, msg=f"Regex failed on {bad_name}") -SAMPLE_DIR = [ - "NuQCJob/only-adapter-filtered/359180345_S58_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180337_S27_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180338_S51_L001_R2_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180338_S51_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180337_S27_L001_R2_001.fastq.gz", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180354_S22_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180338_S51_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180345_S58_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180337_S27_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180353_S17_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180353_S17_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180337_S27_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180345_S58_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180338_S51_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180354_S22_L001_R1_001.json", - "NuQCJob/process_all_fastq_files.sh", - "NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981.completed", - "NuQCJob/logs/slurm-1897981_1.out", - "NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1", -] - if __name__ == "__main__": unittest.main() diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py index 53267c7d..0af31439 100644 --- a/sequence_processing_pipeline/tests/test_Pipeline.py +++ b/sequence_processing_pipeline/tests/test_Pipeline.py @@ -140,7 +140,7 @@ def test_validate_mapping_file_numeric_ids(self): with NamedTemporaryFile() as tmp: 
self._make_mapping_file(tmp.name) exp = ['1.0', '1e-3'] - pipeline = Pipeline(self.good_config_file, self.good_run_id, None, + pipeline = Pipeline(self.good_config_file, self.good_run_id, tmp.name, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -153,7 +153,7 @@ def test_validate_mapping_file_numeric_ids(self): def test_get_sample_names_from_sample_sheet(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.mp_sheet_path, None, + self.mp_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -178,7 +178,7 @@ def test_get_sample_names_from_sample_sheet(self): def test_get_orig_names_from_sheet_with_replicates(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sheet_w_replicates, None, + self.good_sheet_w_replicates, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -198,7 +198,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" "' is not present."): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -210,7 +210,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" ".txt' is not present."): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -222,7 +222,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, bu" "t not readable"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) self.make_runinfo_file_readable() @@ -232,7 +232,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.bad_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -249,7 +249,7 @@ def test_creation(self): " valid sample-sheet."): Pipeline(self.good_config_file, self.good_run_id, - self.bad_assay_type_path, None, + self.bad_assay_type_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -257,7 +257,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.invalid_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -268,7 +268,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(None, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -279,7 +279,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.invalid_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -290,7 +290,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, None, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -300,7 +300,7 @@ def 
test_creation(self): "not a valid json file"): Pipeline(self.good_sample_sheet_path, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -323,7 +323,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -345,7 +345,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -368,7 +368,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -379,7 +379,7 @@ def test_sample_sheet_validation(self): # contained w/in its 'message' member. try: Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) except PipelineError as e: @@ -389,7 +389,7 @@ def test_sample_sheet_validation(self): # test unsuccessful validation of a bad sample-sheet with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - self.bad_sample_sheet_path, None, + self.bad_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) self.assertEqual(str(e.exception), ('Sample-sheet contains errors:\n' @@ -401,7 +401,6 @@ def test_generate_sample_information_files(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, self.good_sample_sheet_path, - None, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -511,6 +510,62 @@ def test_generate_sample_information_files(self): exp = exp_last_lines[some_name] self.assertEqual(obs, exp) + def test_generate_sample_information_files_with_additional_meta(self): + # TODO: With recent changes is this needed? + # test sample-information-file generation. + pipeline = Pipeline(self.good_config_file, self.good_run_id, + self.good_sample_sheet_path, + self.output_file_path, self.qiita_id, + Pipeline.METAGENOMIC_PTYPE) + + # create a dataframe with duplicate information to pass to + # generate_sample_information_files(). Confirm that the duplicates + # are dropped. Confirm 'NOTBLANK_999A' is also filtered out. + df = pd.DataFrame(data=[('BLANK999_999A', 'NYU_BMS_Melanoma_13059'), + ('BLANK999_999A', 'NYU_BMS_Melanoma_13059'), + ('NOTBLANK_999A', 'NYU_BMS_Melanoma_13059')], + columns=['sample_name', 'project_name']) + + sif_path = pipeline.generate_sample_info_files(addl_info=df) + + # get the path for the NYU_BMS_Melanoma dataset. 
+ sif_path = [x for x in sif_path if 'NYU_BMS_Melanoma' in x][0] + + exp_first_line = ("BLANK1.1A\t2021-10-21\t193\t" + "Control\tNegative\tSterile water blank\t" + "Sterile water blank\turban biome\t" + "research facility\tsterile water\t" + "misc environment\tUSA:CA:San Diego\t" + "BLANK1.1A\t32.5\t-117.25\tcontrol blank\t" + "metagenome\t256318\tBLANK1.1A\t" + "NYU_BMS_Melanoma\tTRUE\tUCSD\tFALSE") + + exp_last_line = ("BLANK4.4H\t2021-10-21\t193\tControl\tNegative\t" + "Sterile water blank\tSterile water blank\t" + "urban biome\tresearch facility\tsterile water\t" + "misc environment\tUSA:CA:San Diego\tBLANK4.4H\t" + "32.5\t-117.25\tcontrol blank\tmetagenome\t256318\t" + "BLANK4.4H\tNYU_BMS_Melanoma\tTRUE\tUCSD\tFALSE") + + with open(sif_path, 'r') as f: + obs_lines = f.readlines() + + # confirm that each file contains the expected header. + header = obs_lines[0].strip() + self.assertEqual(header, '\t'.join(Pipeline.sif_header)) + + # confirm that the first line of each file is as expected. + obs = obs_lines[1].strip() + exp = exp_first_line + + self.assertEqual(obs, exp) + + # confirm that the last line of each file is as expected. + obs = obs_lines[-1].strip() + exp = exp_last_line + + self.assertEqual(obs, exp) + def test_get_sample_ids(self): exp_sample_ids = ['CDPH-SAL__Salmonella__Typhi__MDL-143', 'CDPH-SAL_Salmonella_Typhi_MDL-144', @@ -980,7 +1035,7 @@ def test_get_sample_ids(self): 'EP400448B04', 'EP479894B04'] # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1456,7 +1511,7 @@ def test_get_sample_names(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1484,7 +1539,7 @@ def test_get_project_info(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1515,7 +1570,7 @@ def test_get_project_info(self): self.assertEqual(sorted(obs_project_names), sorted(exp_project_names)) pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sheet_w_replicates, None, + self.good_sheet_w_replicates, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1527,7 +1582,7 @@ def test_get_project_info(self): def test_configuration_profiles(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1556,7 +1611,7 @@ def test_configuration_profiles(self): def test_parse_project_name(self): # test sample-information-file generation. 
pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1588,7 +1643,7 @@ def test_parse_project_name(self): def test_identify_reserved_words(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1605,7 +1660,7 @@ def test_identify_reserved_words(self): # create new pipeline using a/legacy (v90) metagenomic sample-sheet. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_legacy_sheet_path, None, + self.good_legacy_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1698,7 +1753,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" "' is not present."): Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1710,7 +1765,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" ".txt' is not present."): Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1721,7 +1776,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, " "but not readable"): - Pipeline(self.good_config_file, self.good_run_id, None, + Pipeline(self.good_config_file, self.good_run_id, self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.make_runinfo_file_readable() @@ -1731,7 +1786,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.bad_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1746,7 +1801,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.invalid_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1757,7 +1812,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(None, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1768,7 +1823,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.invalid_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1779,7 +1834,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, None, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1787,7 +1842,7 @@ def test_mapping_file_validation(self): # test successful validation of a good mapping-file. 
try: Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) except PipelineError as e: @@ -1797,7 +1852,7 @@ def test_mapping_file_validation(self): # test unsuccessful validation of a bad mapping-file. with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - None, self.mf_missing_column, + self.mf_missing_column, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.assertEqual(str(e.exception), ('Mapping-file is missing ' @@ -1807,7 +1862,7 @@ def test_mapping_file_validation(self): # test unsuccessful validation of a bad mapping-file. with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - None, self.mf_duplicate_sample, + self.mf_duplicate_sample, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.assertEqual(str(e.exception), ("Mapping-file contains duplicate " @@ -1834,7 +1889,6 @@ def test_is_sample_sheet(self): def test_generate_sample_information_files(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2056,7 +2110,6 @@ def test_get_sample_ids(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2184,7 +2237,6 @@ def test_get_sample_names(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2208,7 +2260,7 @@ def test_get_project_info(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -2225,33 +2277,12 @@ def test_get_project_info(self): self.assertDictEqual(obs_d, exp_d) break - def test_additional_constuctor_check(self): - with self.assertRaisesRegex(PipelineError, ("sample_sheet_path or " - "mapping_file_path must " - "be defined, but not " - "both.")): - Pipeline(self.good_config_file, self.good_run_id, - None, None, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - - with self.assertRaisesRegex(PipelineError, ("sample_sheet_path or " - "mapping_file_path must " - "be defined, but not " - "both.")): - Pipeline(self.good_config_file, self.good_run_id, - self.sample_sheet_path, - self.good_mapping_file_path, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - def test_dummy_sheet_generation(self): # generate a RunInfo.xml file w/only one indexed read. 
self.create_runinfo_file(four_reads=False) _ = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2270,7 +2301,6 @@ def test_dummy_sheet_generation(self): _ = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2290,7 +2320,6 @@ def test_dummy_sheet_generation(self): def test_process_run_info_file(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2330,7 +2359,6 @@ def test_process_run_info_file(self): def test_identify_reserved_words(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, diff --git a/sequence_processing_pipeline/tests/test_SeqCountsJob.py b/sequence_processing_pipeline/tests/test_SeqCountsJob.py new file mode 100644 index 00000000..d641c3b2 --- /dev/null +++ b/sequence_processing_pipeline/tests/test_SeqCountsJob.py @@ -0,0 +1,74 @@ +from os.path import join +from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob +from functools import partial +import unittest +from json import load as json_load + + +class TestSeqCountsJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.exp = self.path('data', 'tellseq_output', 'integrate_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. + self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.files_to_count_path = self.path("data", "files_to_count.txt") + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "1440" + self.jmem = "8" + self.modules_to_load = [] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.cores_per_task = "1" + self.raw_fastq_dir = join(self.output_path, "TellReadJob", "Full") + self.max_array_length = 100 + self.exp_sbatch_output = self.path("data", "seq_counts.sbatch") + self.exp_results = self.path("data", + "aggregate_counts_results.json") + + def test_creation(self): + def compare_files(obs, exp): + with open(obs, 'r') as f: + obs_lines = f.readlines() + obs_lines = [x.strip() for x in obs_lines] + obs_lines = [x for x in obs_lines if x != ''] + + with open(exp, 'r') as f: + exp_lines = f.readlines() + exp_lines = [x.strip() for x in exp_lines] + exp_lines = [x for x in exp_lines if x != ''] + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + # test basic good-path + job = SeqCountsJob(self.run_dir, self.output_path, self.queue_name, + self.node_count, self.wall_time_limit, self.jmem, + self.modules_to_load, self.qiita_job_id, + self.max_array_length, self.files_to_count_path, + self.cores_per_task) + + obs = job._generate_job_script() + + compare_files(obs, self.exp_sbatch_output) + + # hack log path so that it points to test data directory rather than + # the output directory for a run we didn't run(). 
+ job.log_path = self.path("data", "seq_counts_logs") + + obs = json_load(open(job._aggregate_counts(), 'r')) + exp = json_load(open(self.exp_results, 'r')) + + self.assertDictEqual(obs, exp) + + +if __name__ == '__main__': + unittest.main() diff --git a/sequence_processing_pipeline/tests/test_TRIntegrateJob.py b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py new file mode 100644 index 00000000..17ded346 --- /dev/null +++ b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py @@ -0,0 +1,72 @@ +from os.path import join +from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob +from functools import partial +import unittest + + +class TestTRIntegrateJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', + 'TRIntegrateJob', 'integrate_test.sbatch') + self.exp = self.path('data', 'tellseq_output', 'integrate_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. + self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.sample_sheet_path = self.path('data', + 'tellseq_metag_dummy_sample_' + 'sheet.csv') + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "96:00:00" + self.jmem = "16" + self.modules_to_load = ["singularity_3.6.4"] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.label = "150629_SN1001_0511_AH5L7GBCXX-test" + self.reference_base = "" + self.reference_map = "" + self.tmp1_path = join(self.output_path, "TRIntegrateJob", "output", + "tmp1") + # reflects location of script on host. 
+ self.sing_script_path = ("$HOME/qiita-spots/tellread-release-novaseqX/" + "run_tellread_sing.sh") + self.lane = "1" + self.cores_per_task = "4" + self.integrate_script_path = join(package_root, "contrib", + "integrate-indices-np.py") + self.sil_path = self.path('data', 'fake_sample_index_list.txt') + self.raw_fastq_dir = join(self.output_path, "TellReadJob", "Full") + + def test_creation(self): + # test basic good-path + job = TRIntegrateJob(self.run_dir, self.output_path, + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, + self.qiita_job_id, self.integrate_script_path, + self.sil_path, self.raw_fastq_dir, + self.reference_base, self.reference_map, + self.cores_per_task) + + job._generate_job_script() + + with open(self.obs, 'r') as f: + obs_lines = f.readlines() + + with open(self.exp, 'r') as f: + exp_lines = f.readlines() + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + +if __name__ == '__main__': + unittest.main() diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py new file mode 100644 index 00000000..4c6a75c2 --- /dev/null +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -0,0 +1,66 @@ +from os.path import join +from sequence_processing_pipeline.TellReadJob import TellReadJob +from functools import partial +import unittest + + +class TestTellReadJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', + 'TellReadJob', 'tellread_test.sbatch') + self.exp = self.path('data', 'tellseq_output', 'tellread_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. + self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.sample_sheet_path = self.path('data', + 'tellseq_metag_dummy_sample_' + 'sheet.csv') + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "96:00:00" + self.jmem = "16" + self.modules_to_load = ["singularity_3.6.4"] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.label = "150629_SN1001_0511_AH5L7GBCXX-test" + self.reference_base = "" + self.reference_map = "" + self.tmp1_path = join(self.output_path, "TellReadJob", "output", + "tmp1") + # reflects location of script on host. 
+ self.sing_script_path = ("$HOME/qiita-spots/tellread-release-novaseqX/" + "run_tellread_sing.sh") + self.lane = "1" + self.cores_per_task = "4" + + def test_creation(self): + # test basic good-path + job = TellReadJob(self.run_dir, self.output_path, + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.reference_base, self.reference_map, + self.sing_script_path, self.cores_per_task) + + job._generate_job_script() + + with open(self.obs, 'r') as f: + obs_lines = f.readlines() + + with open(self.exp, 'r') as f: + exp_lines = f.readlines() + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + +if __name__ == '__main__': + unittest.main() diff --git a/sequence_processing_pipeline/tests/test_commands.py b/sequence_processing_pipeline/tests/test_commands.py index f58bb176..ac8a4bd9 100644 --- a/sequence_processing_pipeline/tests/test_commands.py +++ b/sequence_processing_pipeline/tests/test_commands.py @@ -16,12 +16,12 @@ def test_split_similar_size_bins(self, glob, stat): class MockStat: st_size = 2 ** 28 # 256MB - mockglob = ['/foo/bar/a_R1_.fastq.gz', - '/foo/bar/b_R2_.fastq.gz', - '/foo/bar/a_R2_.fastq.gz', - '/foo/baz/c_R2_.fastq.gz', - '/foo/baz/c_R1_.fastq.gz', - '/foo/bar/b_R1_.fastq.gz'] + mockglob = ['/foo/bar/a_R1_001.fastq.gz', + '/foo/bar/b_R2_001.fastq.gz', + '/foo/bar/a_R2_001.fastq.gz', + '/foo/baz/c_R2_001.fastq.gz', + '/foo/baz/c_R1_001.fastq.gz', + '/foo/bar/b_R1_001.fastq.gz'] with TemporaryDirectory() as tmp: exp = (2, 1073741824) @@ -30,9 +30,12 @@ class MockStat: obs = split_similar_size_bins('foo', 1, tmp + '/prefix') self.assertEqual(obs, exp) - exp_1 = ('/foo/bar/a_R1_.fastq.gz\t/foo/bar/a_R2_.fastq.gz\tbar\n' - '/foo/bar/b_R1_.fastq.gz\t/foo/bar/b_R2_.fastq.gz\tbar\n') - exp_2 = '/foo/baz/c_R1_.fastq.gz\t/foo/baz/c_R2_.fastq.gz\tbaz\n' + exp_1 = ('/foo/bar/a_R1_001.fastq.gz\t/foo/bar/a_R2_001.fastq.gz' + '\tbar\n' + '/foo/bar/b_R1_001.fastq.gz\t/foo/bar/b_R2_001.fastq.gz' + '\tbar\n') + exp_2 = ('/foo/baz/c_R1_001.fastq.gz\t/foo/baz/c_R2_001.fastq.gz' + '\tbaz\n') obs_1 = open(tmp + '/prefix-1').read() self.assertEqual(obs_1, exp_1) @@ -56,10 +59,6 @@ def test_demux(self): '@2::MUX::bing/2', 'ATGC', '+', '!!!!', '']) infile = io.StringIO(infile_data) - exp_data_r1 = '\n'.join(['@baz/1', 'ATGC', '+', '!!!!', - '@bing/1', 'ATGC', '+', '!!!!', '']) - exp_data_r2 = '\n'.join(['@baz/2', 'ATGC', '+', '!!!!', - '@bing/2', 'ATGC', '+', '!!!!', '']) exp_data_r1 = ['@baz/1', 'ATGC', '+', '!!!!', '@bing/1', 'ATGC', '+', '!!!!'] diff --git a/sequence_processing_pipeline/tests/test_util.py b/sequence_processing_pipeline/tests/test_util.py index 136dc9a0..e5073101 100644 --- a/sequence_processing_pipeline/tests/test_util.py +++ b/sequence_processing_pipeline/tests/test_util.py @@ -4,24 +4,18 @@ class TestUtil(unittest.TestCase): def test_iter_paired_files(self): - tests = [(['a_R1_foo', - 'b_R2_bar', - 'a_R2_baz', - 'b_R1_bing'], - [('a_R1_foo', 'a_R2_baz'), - ('b_R1_bing', 'b_R2_bar')]), - (['a.R1.foo', - 'b.R2.bar', - 'a.R2.baz', - 'b.R1.bing'], - [('a.R1.foo', 'a.R2.baz'), - ('b.R1.bing', 'b.R2.bar')]), - (['a.R1.foo', - 'b_R2_bar', - 'a.R2.baz', - 'b_R1_bing'], - [('a.R1.foo', 'a.R2.baz'), - ('b_R1_bing', 'b_R2_bar')])] + # tuples of randomly ordered fastq files and thier expected + # sorted and organized output from iter_paired_files(). + + # underscore filenames updated to require '_001.fastq.gz'. 
+ # legacy dot filenames test remains as-is. + tests = [(['b_R2_001.fastq.gz', 'a_R1_001.fastq.gz', + 'a_R2_001.fastq.gz', 'b_R1_001.fastq.gz'], + [('a_R1_001.fastq.gz', 'a_R2_001.fastq.gz'), + ('b_R1_001.fastq.gz', 'b_R2_001.fastq.gz')]), + (['a.R1.foo', 'b.R2.bar', 'a.R2.baz', 'b.R1.bing'], + [('a.R1.foo', 'a.R2.baz'), ('b.R1.bing', 'b.R2.bar')])] + for files, exp in tests: obs = list(iter_paired_files(files)) self.assertEqual(obs, exp) @@ -42,7 +36,7 @@ def test_iter_paired_files_bad_pair(self): list(iter_paired_files(files)) def test_iter_paired_files_mismatch_prefix(self): - files = ['a_R1_foo', 'ab_R2_foo'] + files = ['a_R1_001.fastq.gz', 'ab_R2_001.fastq.gz'] with self.assertRaisesRegex(ValueError, "Mismatch prefixes"): list(iter_paired_files(files)) diff --git a/sequence_processing_pipeline/util.py b/sequence_processing_pipeline/util.py index d9586f81..e19bf98a 100644 --- a/sequence_processing_pipeline/util.py +++ b/sequence_processing_pipeline/util.py @@ -1,7 +1,18 @@ import re -PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') +# PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') +# The above will truncate on the first _R1_ found, which only works when _R1_ +# or _R2_ appears exactly once in a file path. When the wet-lab incorporates +# these same strings in their sample-names as descriptive metadata, this +# assumption is broken. For all raw fastq files being used as input into +# NuQCJob, we can assume they end in the following convention. Per Illumina +# spec, all fastq files end in _001 and we preserve this convention even at +# the cost of renaming output files from TRIntegrateJob. +# PAIR_DOT is kept as is, but may be removed later because for the purposes of +# SPP, no input should ever be named with dots instead of underscores. +PAIR_UNDERSCORE = (re.compile(r'_R1_001.fastq.gz'), + '_R1_001.fastq.gz', '_R2_001.fastq.gz') PAIR_DOT = (re.compile(r'\.R1\.'), '.R1.', '.R2.') PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT)
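
The pairing tuples above can be exercised on their own. The snippet below is a
minimal, self-contained sketch (not the pipeline's iter_paired_files()
implementation) showing how PAIR_TESTS-style tuples derive an R2 partner name
from an R1 name under the new '_R1_001.fastq.gz' convention. The partner_of()
helper and the example file names are hypothetical, and the regex dots are
escaped here for strictness.

    import re

    # same shape as the tuples defined above; escaping the dots is an
    # assumption made for this sketch only.
    PAIR_UNDERSCORE = (re.compile(r'_R1_001\.fastq\.gz'),
                       '_R1_001.fastq.gz', '_R2_001.fastq.gz')
    PAIR_DOT = (re.compile(r'\.R1\.'), '.R1.', '.R2.')
    PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT)


    def partner_of(r1_name):
        # return the expected R2 file name for an R1 file name, or None if
        # the name matches neither naming convention.
        for pattern, r1_token, r2_token in PAIR_TESTS:
            if pattern.search(r1_name):
                return r1_name.replace(r1_token, r2_token)
        return None


    # a descriptive sample name containing '_R1_' no longer confuses the
    # underscore convention, because only the full '_R1_001.fastq.gz' token
    # is matched and swapped.
    print(partner_of('Sample_R1_test_S1_L001_R1_001.fastq.gz'))
    # Sample_R1_test_S1_L001_R2_001.fastq.gz
    print(partner_of('a.R1.foo'))  # legacy dot convention: a.R2.foo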