Skip to content

Commit

Permalink
BUG: demux min length (qiime2#32)
Browse files Browse the repository at this point in the history
  • Loading branch information
Oddant1 authored and thermokarst committed May 16, 2019
1 parent b9879ee commit 70084a9
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 23 deletions.
19 changes: 12 additions & 7 deletions q2_cutadapt/_demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,11 @@ def run_command(cmd, verbose=True):


def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
untrimmed_dir_fmt, error_rate):
untrimmed_dir_fmt, error_rate, minimum_length):
cmd = ['cutadapt',
'--front', 'file:%s' % barcode_fhs['fwd'].name,
'--error-rate', str(error_rate),
'--minimum-length', str(minimum_length),
# {name} is a cutadapt convention for interpolating the sample id
# into the filename.
'-o', os.path.join(str(per_sample_dir_fmt), '{name}.1.fastq.gz'),
Expand Down Expand Up @@ -106,7 +107,7 @@ def _write_empty_fastq_to_mux_barcode_in_seq_fmt(seqs_dir_fmt):


def _demux(seqs, forward_barcodes, reverse_barcodes, error_tolerance,
mux_fmt, batch_size):
mux_fmt, batch_size, minimum_length):
fwd_barcode_name = forward_barcodes.name
forward_barcodes = forward_barcodes.drop_missing_values()
barcodes = forward_barcodes.to_series().to_frame()
Expand Down Expand Up @@ -156,7 +157,8 @@ def _demux(seqs, forward_barcodes, reverse_barcodes, error_tolerance,
open_fhs['rev'])
cmd = _build_demux_command(previous_untrimmed, open_fhs,
per_sample_sequences,
current_untrimmed, error_tolerance)
current_untrimmed, error_tolerance,
minimum_length)
run_command(cmd)
open_fhs['fwd'].close()
if reverse_barcodes is not None:
Expand All @@ -174,20 +176,23 @@ def _demux(seqs, forward_barcodes, reverse_barcodes, error_tolerance,
def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
barcodes: qiime2.CategoricalMetadataColumn,
error_rate: float = 0.1,
batch_size: int = 0) -> \
batch_size: int = 0,
minimum_length: int = 1) -> \
(CasavaOneEightSingleLanePerSampleDirFmt,
MultiplexedSingleEndBarcodeInSequenceDirFmt):
mux_fmt = MultiplexedSingleEndBarcodeInSequenceDirFmt
return _demux(seqs, barcodes, None, error_rate, mux_fmt, batch_size)
return _demux(seqs, barcodes, None, error_rate, mux_fmt, batch_size,
minimum_length)


def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
forward_barcodes: qiime2.CategoricalMetadataColumn,
reverse_barcodes: qiime2.CategoricalMetadataColumn = None,
error_rate: float = 0.1,
batch_size: int = 0) -> \
batch_size: int = 0,
minimum_length: int = 1) -> \
(CasavaOneEightSingleLanePerSampleDirFmt,
MultiplexedPairedEndBarcodeInSequenceDirFmt):
mux_fmt = MultiplexedPairedEndBarcodeInSequenceDirFmt
return _demux(seqs, forward_barcodes, reverse_barcodes, error_rate,
mux_fmt, batch_size)
mux_fmt, batch_size, minimum_length)
14 changes: 12 additions & 2 deletions q2_cutadapt/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@
'error_rate': Float % Range(0, 1, inclusive_start=True,
inclusive_end=True),
'batch_size': Int % Range(0, None),
'minimum_length': Int % Range(1, None),
},
outputs=[
('per_sample_sequences', SampleData[SequencesWithQuality]),
Expand All @@ -251,7 +252,11 @@
'concurrently. Demultiplexing in smaller batches will '
'yield the same result with marginal speed loss, and '
'may solve "too many files" errors related to sample '
'quantity. Set to "0" to process all samples at once.'
'quantity. Set to "0" to process all samples at once.',
'minimum_length': 'Discard reads shorter than specified value. Note, '
'the cutadapt default of 0 has been overridden, '
'because that value produces empty sequence '
'records.',
},
output_descriptions={
'per_sample_sequences': 'The resulting demultiplexed sequences.',
Expand All @@ -276,6 +281,7 @@
'error_rate': Float % Range(0, 1, inclusive_start=True,
inclusive_end=True),
'batch_size': Int % Range(0, None),
'minimum_length': Int % Range(1, None),
},
outputs=[
('per_sample_sequences', SampleData[PairedEndSequencesWithQuality]),
Expand All @@ -295,7 +301,11 @@
'concurrently. Demultiplexing in smaller batches will '
'yield the same result with marginal speed loss, and '
'may solve "too many files" errors related to sample '
'quantity. Set to "0" to process all samples at once.'
'quantity. Set to "0" to process all samples at once.',
'minimum_length': 'Discard reads shorter than specified value. Note, '
'the cutadapt default of 0 has been overridden, '
'because that value produces empty sequence '
'records.',
},
output_descriptions={
'per_sample_sequences': 'The resulting demultiplexed sequences.',
Expand Down
52 changes: 38 additions & 14 deletions q2_cutadapt/tests/test_demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,25 @@ def test_batch_size_odd_number_of_samples(self):
# obs_untrimmed should be empty, since everything matched
self.assert_untrimmed_results(b'', obs_untrimmed_art)

def test_min_length(self):
metadata = CategoricalMetadataColumn(
# The third barcode is meant to completely remove the only GGGG
# coded sequence
pd.Series(['AAAA', 'CCCC', 'GGGGACGTACGT'], name='Barcode',
index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
name='id')))

with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
self.demux_single_fn(self.muxed_sequences, metadata)

obs = obs_demuxed_art.view(SingleLanePerSampleSingleEndFastqDirFmt)

(obs_f1, _), (obs_f2, _) = obs.sequences.iter_views(FastqGzFormat)

self.assertEqual('sample_a_AAAA_L001_R1_001.fastq.gz', str(obs_f1))
self.assertEqual('sample_b_CCCC_L001_R1_001.fastq.gz', str(obs_f2))


class TestDemuxPaired(TestPluginBase):
package = 'q2_cutadapt.tests'
Expand Down Expand Up @@ -330,13 +349,15 @@ def test_build_demux_command(self):
{'fwd': barcode_fasta, 'rev': None},
self.per_sample_dir_fmt,
self.untrimmed_dir_fmt,
0.1)
0.1,
2)
self.assertTrue(barcode_fasta.name in obs[2])
self.assertTrue('0.1' in obs[4])
self.assertTrue(str(self.per_sample_dir_fmt) in obs[6])
self.assertTrue(str(self.untrimmed_dir_fmt) in obs[8])
self.assertTrue('2' in obs[6])
self.assertTrue(str(self.per_sample_dir_fmt) in obs[8])
self.assertTrue(str(self.untrimmed_dir_fmt) in obs[10])
self.assertEqual(str(self.seqs_dir_fmt.file.view(FastqGzFormat)),
obs[9])
obs[11])

def test_rename_files_single(self):
for fn in ['sample_a.fastq.gz', 'sample_b.fastq.gz']:
Expand Down Expand Up @@ -410,17 +431,19 @@ def test_build_demux_command(self):
{'fwd': barcode_fasta, 'rev': None},
self.per_sample_dir_fmt,
self.untrimmed_dir_fmt,
0.1)
0.1,
2)
self.assertTrue(barcode_fasta.name in obs[2])
self.assertTrue('0.1' in obs[4])
self.assertTrue(str(self.per_sample_dir_fmt) in obs[6]) # fwd
self.assertTrue(str(self.per_sample_dir_fmt) in obs[10]) # rev
self.assertTrue(str(self.untrimmed_dir_fmt) in obs[8]) # fwd
self.assertTrue(str(self.untrimmed_dir_fmt) in obs[12]) # rev
self.assertTrue('2' in obs[6])
self.assertTrue(str(self.per_sample_dir_fmt) in obs[8]) # fwd
self.assertTrue(str(self.per_sample_dir_fmt) in obs[12]) # rev
self.assertTrue(str(self.untrimmed_dir_fmt) in obs[10]) # fwd
self.assertTrue(str(self.untrimmed_dir_fmt) in obs[14]) # rev
exp_f = str(self.seqs_dir_fmt.forward_sequences.view(FastqGzFormat))
self.assertEqual(exp_f, obs[13])
self.assertEqual(exp_f, obs[15])
exp_r = str(self.seqs_dir_fmt.reverse_sequences.view(FastqGzFormat))
self.assertEqual(exp_r, obs[14])
self.assertEqual(exp_r, obs[16])

def test_build_di_demux_command(self):
with tempfile.NamedTemporaryFile() as barcode_fasta_f:
Expand All @@ -430,10 +453,11 @@ def test_build_di_demux_command(self):
'rev': barcode_fasta_r},
self.per_sample_dir_fmt,
self.untrimmed_dir_fmt,
0.1)
0.1,
2)
self.assertTrue(barcode_fasta_f.name in obs[2])
self.assertTrue('--pair-adapters' == obs[9])
self.assertTrue(barcode_fasta_r.name in obs[11])
self.assertTrue('--pair-adapters' == obs[11])
self.assertTrue(barcode_fasta_r.name in obs[13])

def test_rename_files(self):
for fn in ['sample_a.1.fastq.gz', 'sample_a.2.fastq.gz',
Expand Down

0 comments on commit 70084a9

Please sign in to comment.