From 520e0b5596dfb08e148a76d232930394930af088 Mon Sep 17 00:00:00 2001 From: khyox Date: Wed, 29 Nov 2023 19:00:59 -0800 Subject: [PATCH] Add refafilt script for filtering fasta files by sequence length On branch dev modified: README.md modified: recentrifuge/__init__.py new file: refafilt modified: setup.py --- README.md | 2 +- recentrifuge/__init__.py | 4 +- refafilt | 326 +++++++++++++++++++++++++++++++++++++++ setup.py | 12 +- 4 files changed, 338 insertions(+), 6 deletions(-) create mode 100755 refafilt diff --git a/README.md b/README.md index a723d7b..92fb556 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ____ -[![Retest](https://github.com/khyox/Recentrifuge/actions/workflows/retest.yaml/badge.svg?branch=v1.12.1)](https://github.com/khyox/recentrifuge/actions/workflows/retest.yaml) +[![Retest](https://github.com/khyox/Recentrifuge/actions/workflows/retest.yaml/badge.svg?branch=v1.13.0)](https://github.com/khyox/recentrifuge/actions/workflows/retest.yaml) [![](https://img.shields.io/maintenance/yes/2023.svg)](http://www.recentrifuge.org) [![](https://img.shields.io/github/languages/top/khyox/recentrifuge.svg)](https://pypi.org/project/recentrifuge/) [![](https://img.shields.io/pypi/pyversions/recentrifuge.svg)](https://pypi.org/project/recentrifuge/) diff --git a/recentrifuge/__init__.py b/recentrifuge/__init__.py index 33bf2b1..23f54b3 100644 --- a/recentrifuge/__init__.py +++ b/recentrifuge/__init__.py @@ -33,8 +33,8 @@ __email__ = 'jse.mnl **AT** gmail.com' __maintainer__ = 'Jose Manuel Marti' __status__ = 'Production/Stable' -__date__ = 'Apr 2023' -__version__ = '1.12.1' +__date__ = 'Nov 2023' +__version__ = '1.13.0' import sys from Bio import SeqIO diff --git a/refafilt b/refafilt new file mode 100755 index 0000000..2c7c2a7 --- /dev/null +++ b/refafilt @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +# +# Copyright (C) 2017–2023, Jose Manuel Martí Martínez +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +""" +Filter a FASTA file attending to the length of the sequences. +""" +# pylint: disable=no-name-in-module, not-an-iterable +import argparse +import collections +import contextlib +import gzip +import pathlib +import sys +import time +from typing import Callable, Dict, TextIO, Counter, TypeAlias + +from Bio.SeqIO.FastaIO import SimpleFastaParser +import matplotlib.pyplot as plt +import numpy as np + +from recentrifuge import __version__, __author__, __date__ +from recentrifuge.config import LICENSE, GZEXT +from recentrifuge.config import gray, red, green, cyan, magenta, blue, yellow + +Filename: TypeAlias = pathlib.Path + +def main(): + """Main entry point to script.""" + + def vprint(*arguments) -> None: + """Print only if verbose/debug mode is enabled""" + if args.debug: + print(*arguments) + + def configure_parser(): + """Argument Parser Configuration""" + parser = argparse.ArgumentParser( + description='Filter a FASTA file by sequence length', + epilog=f'%(prog)s - Release {__version__} - {__date__}' + LICENSE, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + '-d', '--debug', + action='store_true', + help='increase output verbosity and perform additional checks' + ) + parser.add_argument( + '-i', '--input', + action='store', + metavar='FILE', + type=Filename, + default=None, + required=True, + help='input FASTA file, which may be gzipped' + ) + parser.add_argument( + '-o', '--output_pass', + action='store', + metavar='FILE', + type=Filename, + default=None, + required=False, + help='output file for sequences passing the filter(s)' + ) + parser.add_argument( + '-s', '--output_short', + action='store', + metavar='FILE', + type=Filename, + default=None, + required=False, + help='output file for sequences too short' + ) + parser.add_argument( + '-l', '--output_long', + action='store', + metavar='FILE', + type=Filename, + default=None, + required=False, + help='output file for sequences too long' + ) + parser.add_argument( + '-p', '--output_plot', + action='store', + metavar='FILE', + type=Filename, + default=None, + required=False, + help='output PDF file for histogram plot' + ) + parser.add_argument( + '-f', '--filter_less_than', + action='store', + metavar='NUMBER', + type=int, + default=None, + help='minimum length of the accepted sequences' + ) + parser.add_argument( + '-F', '--filter_more_than', + action='store', + metavar='NUMBER', + type=int, + default=None, + help='maximum length of the accepted sequences' + ) + parser.add_argument( + '-m', '--maxreads', + action='store', + metavar='NUMBER', + type=int, + default=None, + help=('maximum number of FASTA reads to process; ' + 'default: no maximum') + ) + parser.add_argument( + '-c', '--compress', + action='store_true', + default=False, + help='the resulting FASTA files will be gzipped' + ) + parser.add_argument( + '-V', '--version', + action='version', + version=f'%(prog)s release {__version__} ({__date__})' + ) + return parser + + def check_debug(): + """Check debugging mode""" + if args.debug: + print(blue('INFO:'), gray('Debugging mode activated')) + print(blue('INFO:'), gray('Active parameters:')) + for key, val in vars(args).items(): + if val is not None and val is not False and val != []: + print(gray(f'\t{key} ='), f'{val}') + + # timing initialization + start_time: float = time.time() + # Program header + print(f'\n=-= {sys.argv[0]} =-= v{__version__} - {__date__}' + f' =-= by {__author__} =-=\n') + sys.stdout.flush() + + # Parse arguments and check debug + argparser = configure_parser() + args = argparser.parse_args() + check_debug() + compr = args.compress + in_fasta: Filename = Filename(args.input) + output_pass: Filename = args.output_pass + if output_pass is None: + output_pass = Filename(f'{in_fasta.stem}_passed.fa') + vprint(blue('INFO:'), + gray('Passing filter output file: '), f'{output_pass}') + output_plot: Filename = args.output_plot + if output_plot is None: + output_plot = Filename(f'{in_fasta.stem}_hist.pdf') + + min_filt: bool + if args.filter_less_than is None: + min_filt: bool = False + else: + min_filt: bool = True + min_len: int = args.filter_less_than + output_short: Filename = args.output_short + if output_short is None: + output_short = Filename(f'{in_fasta.stem}_tooshort{min_len}.fa') + vprint(blue('INFO:'), + gray('Too short filter output: '), f'{output_short}') + + max_filt: bool + if args.filter_more_than is None: + max_filt: bool = False + else: + max_filt: bool = True + max_len: int = args.filter_more_than + output_long: Filename = args.output_long + if output_long is None: + output_long = Filename(f'{in_fasta.stem}_toolong{max_len}nt.fa') + vprint(blue('INFO:'), + gray('Too long filter output: '), f'{output_long}') + + def is_gzipped(fpath: Filename): + """Check if a file exists and is gzipped""" + try: + with open(fpath, 'rb') as ftest: + return ftest.read(2) == b'\x1f\x8b' + except FileNotFoundError: + return False + + # Prepare output + handler: Callable[..., TextIO] + fext: Filename = Filename('.fa') + if compr: + handler = gzip.open + fext += GZEXT + else: + handler = open + stats: Counter[int] = collections.Counter() + + # Main loop: filter FASTA file + i: int = 0 + pass_lens: dict[int] = [] + tiny_lens: dict[int] = [] + long_lens: dict[int] = [] + print(gray('Filtering FASTA file'), f'{in_fasta}', gray('...\nMseqs: '), + end='') + sys.stdout.flush() + if is_gzipped(in_fasta): + handler = gzip.open + else: + handler = open + with (handler(in_fasta, 'rt') as fa, + open(output_pass, 'w') as pass_file, + open(output_short, 'w') if min_filt else contextlib.nullcontext( + None) as short_file, + open(output_long, 'w') if max_filt else contextlib.nullcontext( + None) as long_file + ): + for i, (title, seq) in enumerate(SimpleFastaParser(fa) + ): + if args.maxreads and i >= args.maxreads: + print(yellow(' [stopping by maxreads limit]!'), end='') + break + elif not i % 1000000: + print(f'{i // 1000000:_d}', end='') + sys.stdout.flush() + elif not i % 100000: + print('.', end='') + sys.stdout.flush() + seq_len : int = len(seq) + if min_filt and seq_len < min_len: + short_file.write(f'>{title}\n') + short_file.write(seq + '\n') + stats['seq_tiny'] += 1 + stats['nt_tiny'] += seq_len + tiny_lens.append(seq_len) + elif max_filt and seq_len > max_len: + long_file.write(f'>{title}\n') + long_file.write(seq + '\n') + stats['seq_long'] += 1 + stats['nt_long'] += seq_len + long_lens.append(seq_len) + else: + pass_file.write(f'>{title}\n') + pass_file.write(seq + '\n') + stats['seq_pass'] += 1 + stats['nt_pass'] += seq_len + pass_lens.append(seq_len) + + # Final statistics + print(cyan(f' {i / 1e+6:.3g} Mseqs'), green('OK! ')) + print(gray('Passed'), magenta(f'{stats["nt_pass"] / 1e+6:.3g}'), + gray('Mnucs in'), stats['seq_pass'], gray('sequences '), + magenta(f'({stats["seq_passed"]/i:.3%}).')) + pass_lens_np = np.array(pass_lens) + if pass_lens: + print(gray('MIN length: '), f'{np.min(pass_lens_np):n}') + print(gray('AVG length: '), f'{np.average(pass_lens_np):.2g}') + print(gray('MAX length: '), f'{np.max(pass_lens_np, initial=0):n}') + print('') + + tiny_lens_np = np.array(tiny_lens) + if min_filt: + print(gray('Too short'), magenta(f'{stats["nt_tiny"] / 1e+3:.3g}'), + gray('Knucs in'), stats['seq_tiny'], gray('sequences '), + magenta(f'({stats["seq_tiny"]/i:.3%}).')) + if tiny_lens: + print(gray('Too short MIN length: '), f'{np.min(tiny_lens_np):n}') + print(gray('Too short AVG length: '), + f'{np.average(tiny_lens_np):.2g}') + print(gray('Too short MAX length: '), f'{np.max(tiny_lens_np):n}') + print('') + + long_lens_np = np.array(long_lens) + if max_filt: + print(gray('Too long'), magenta(f'{stats["nt_long"] / 1e+6:.3g}'), + gray('Mnucs in'), stats['seq_long'], gray('sequences '), + magenta(f'({stats["seq_long"]/i:.3%}).')) + if long_lens: + print(gray('Too long MIN length: '), f'{np.min(long_lens_np):n}') + print(gray('Too long AVG length: '), + f'{np.average(long_lens_np):.2g}') + print(gray('Too long MAX length: '), f'{np.max(long_lens_np):n}') + print('') + + # Histogram + lens_np: np.ndarray = np.concatenate( + (pass_lens_np, tiny_lens_np, long_lens_np)) + bins = np.logspace(np.floor(np.log10(np.min(lens_np))), + np.ceil(np.log10(np.max(lens_np))), 100) + plt.hist(lens_np, bins=bins) + plt.title("Log histogram of reads by length in nucleotides") + plt.yscale("log") + plt.ylabel("Number of reads per bin") + plt.xscale("log") + plt.xlabel("Length of the reads in nucleotides (100 bins)") + if min_filt: + plt.axvline(x=min_len, color='r') + if max_filt: + plt.axvline(x=max_len, color='r') + plt.savefig(output_plot) + + # Timing results + print(gray('Total elapsed time:'), time.strftime( + "%H:%M:%S", time.gmtime(time.time() - start_time))) + + +if __name__ == '__main__': + main() diff --git a/setup.py b/setup.py index ed0cb72..7702edf 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='recentrifuge', - version='1.12.1', + version='1.13.0', packages=['recentrifuge'], url='http://www.recentrifuge.org', license='AGPL except krona.js, with its own license by BNBI', @@ -12,7 +12,7 @@ long_description=""" **Robust comparative analysis and contamination removal for metagenomics** -[![Retest](https://github.com/khyox/Recentrifuge/actions/workflows/retest.yaml/badge.svg?branch=v1.12.1)](https://github.com/khyox/recentrifuge/actions/workflows/retest.yaml) +[![Retest](https://github.com/khyox/Recentrifuge/actions/workflows/retest.yaml/badge.svg?branch=v1.13.0)](https://github.com/khyox/recentrifuge/actions/workflows/retest.yaml) With Recentrifuge, researchers can interactively explore what organisms are in their samples and at which level of confidence, thus enabling a robust comparative analysis of multiple samples in any metagenomic study. @@ -61,7 +61,13 @@ }, scripts=['rcf', 'rextract', 'retaxdump', 'remock', 'retest', 'refasplit', 'rextraccnt'], - install_requires=['biopython'], + install_requires=[ + 'biopython==1.79', + 'numpy>=1.19.5', + 'openpyxl==3.1.2', + 'matplotlib>=3.3.4', + 'pandas<2.0.0', + ], python_requires='>=3.6', include_package_data=True, package_data={