diff --git a/README.md b/README.md index 15470b2..459b4de 100644 --- a/README.md +++ b/README.md @@ -17,11 +17,11 @@ You can install Sniffles2 using pip or conda using: or -`conda install sniffles=2.5.2` +`conda install sniffles=2.5.3` If you previously installed Sniffles1 using conda and want to upgrade to Sniffles2, you can use: -`conda update sniffles=2.5.2` +`conda update sniffles=2.5.3` ## Requirements * Python ==3.10.15 diff --git a/setup.cfg b/setup.cfg index 182a813..da01d28 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = sniffles -version = 2.5.2 +version = 2.5.3 author = Moritz Smolka, Hermann Romanek author_email = moritz.g.smolka@gmail.com, sniffles@romanek.at description = A fast structural variation caller for long-read sequencing data diff --git a/setup.py b/setup.py index 6fd67de..5a89904 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='sniffles', - version='2.5.2', + version='2.5.3', packages=find_packages(), url='https://github.com/fritzsedlazeck/Sniffles', license='MIT', diff --git a/src/sniffles/config.py b/src/sniffles/config.py index 9287f96..be10fbf 100644 --- a/src/sniffles/config.py +++ b/src/sniffles/config.py @@ -21,7 +21,7 @@ from sniffles.region import Region VERSION = "Sniffles2" -BUILD = "2.5" +BUILD = "2.5.3" SNF_VERSION = "S2_rc4" @@ -168,7 +168,26 @@ def add_genotype_args(self, parser): genotype_args.add_argument("--sample-id", type=str, help="Custom ID for this sample, used for later multi-sample calling (stored in .snf)", default=None) genotype_args.add_argument("--genotype-vcf", metavar="IN.vcf", type=str, help="Determine the genotypes for all SVs in the given input .vcf file (forced calling). Re-genotyped .vcf will be written to the output file specified with --vcf.", default=None) - def add_multi_args(self, parser): + combine_high_confidence: float + combine_low_confidence: float + combine_low_confidence_abs: int + combine_null_min_coverage: int + combine_match: int + combine_match_max: int + combine_separate_intra: bool + combine_output_filtered: bool + combine_pair_relabel: bool + combine_pair_relabel_threshold: int + combine_close_handles: bool + combine_pctseq: float + combine_max_inmemory_results: int + combine_support_threshold: int + + @classmethod + def add_multi_args(cls, parser): + """ + Arguments for multi-sample calling + """ multi_args = parser.add_argument_group("Multi-Sample Calling / Combine parameters") multi_args.add_argument("--combine-high-confidence", metavar="F", type=float, help="Minimum fraction of samples in which a SV needs to have individually passed QC for it to be reported in combined output (a value of zero will report all SVs that pass QC in at least one of the input samples)", default=0.0) multi_args.add_argument("--combine-low-confidence", metavar="F", type=float, help="Minimum fraction of samples in which a SV needs to be present (failed QC) for it to be reported in combined output", default=0.2) @@ -183,6 +202,9 @@ def add_multi_args(self, parser): multi_args.add_argument("--combine-close-handles", help="Close .SNF file handles after each use. May lower performance, but may be required when maximum number of file handles supported by OS is reached when merging many samples.", default=False, action="store_true") multi_args.add_argument("--combine-pctseq", default=0.7, type=float, help="Minimum alignment distance as percent of SV length to be merged. Set to 0 to disable alignments for merging.") multi_args.add_argument("--combine-max-inmemory-results", default=20, type=int, help=argparse.SUPPRESS) + multi_args.add_argument("--combine-support-threshold", default=3, metavar="N", type=int, help="Minimum support for SVs to be considered for multi-sample calling.") + multi_args.add_argument("--re-qc", metavar="auto", default="auto", type=str, help="Re-QC SVs from SNF files. Set to 0 to disable re-qc of SNF files. Set to 1 to force re-qc. Default of 'auto' will try to fix known errors in SNF files.") + # multi_args.add_argument("--combine-exhaustive", help="(DEV) Disable performance optimization in multi-calling", default=False, action="store_true") # multi_args.add_argument("--combine-relabel-rare", help="(DEV)", default=False, action="store_true") # multi_args.add_argument("--combine-with-missing", help="(DEV)", default=False, action="store_true") @@ -326,6 +348,13 @@ def __init__(self, *args, **kwargs): if self.dev_no_qc: self.no_qc = True + if self.re_qc == 'auto': + self.reqc = 'auto' + elif self.re_qc in ('0', '1'): + self.reqc = bool(int(self.re_qc)) + else: + util.fatal_error('Invalid value for --re-qc, allowed values are: auto, 0, 1') + if not hasattr(self, 'mapq'): self.mapq = 0 if self.dev_no_qc else 20 if not hasattr(self, 'min_alignment_length'): diff --git a/src/sniffles/genotyping.py b/src/sniffles/genotyping.py index b5773a3..a407dbd 100644 --- a/src/sniffles/genotyping.py +++ b/src/sniffles/genotyping.py @@ -48,10 +48,19 @@ class Genotyper: _support: int _coverage: float - def __init__(self, svcall: SVCall, config, phase): + def __init__(self, svcall: SVCall, config, phase: tuple | None): self.svcall = svcall self.config = config - self.phase = phase + self.phase = phase if phase is not None else self._get_phase() + + def _get_phase(self) -> tuple | None: + """ + Get phasing information from the genotyped SV, or None if it can't be determined. + """ + try: + return self.svcall.genotypes[0][5] + except (KeyError, IndexError): + return None def _calculate_support(self) -> int: return self.svcall.support diff --git a/src/sniffles/parallel.py b/src/sniffles/parallel.py index 9b9736b..d00f30b 100644 --- a/src/sniffles/parallel.py +++ b/src/sniffles/parallel.py @@ -114,7 +114,7 @@ def call_candidates(self, keep_qc_fails, config): self.coverage_average_total = self.coverage_average_fwd + self.coverage_average_rev return candidates - def finalize_candidates(self, candidates, keep_qc_fails, config): + def finalize_candidates(self, candidates: list['SVCall'], keep_qc_fails, config): passed = [] for svcall in candidates: svcall.qc = svcall.qc and postprocessing.qc_sv(svcall, config) @@ -350,6 +350,7 @@ def execute(self, worker: 'SnifflesWorker' = None): bin_min_size = self.config.combine_min_size bin_max_candidates = max(25, int(len(self.config.snf_input_info) * 0.5)) overlap_abs = self.config.combine_overlap_abs + support_threshold = self.config.combine_support_threshold sample_internal_ids = set(samples_headers_snf.keys()) @@ -375,15 +376,21 @@ def execute(self, worker: 'SnifflesWorker' = None): for svtype in sv.TYPES: bins = {} - # svcandidates=[] - for sample_internal_id in samples_headers_snf.keys(): # fetch current block for each file + for sample_internal_id, sample_snf in samples_headers_snf.items(): # fetch current block for each file blocks = samples_blocks[sample_internal_id] + reqc = sample_snf.reqc + if blocks is None: continue for block in blocks: # usually only 1 block for cand in block[svtype]: # if config.combine_pass_only and (cand.qc==False or cand.filter!="PASS"): # continue + if cand.support < support_threshold: + continue + + if reqc: + postprocessing.genotype_sv(cand, self.config) cand.sample_internal_id = sample_internal_id diff --git a/src/sniffles/postprocessing.py b/src/sniffles/postprocessing.py index 5dc66c7..23eb2dd 100644 --- a/src/sniffles/postprocessing.py +++ b/src/sniffles/postprocessing.py @@ -310,7 +310,7 @@ def qc_sv(svcall: SVCall, config: SnifflesConfig): elif svcall.coverage_upstream < svcall.coverage_downstream: if (upstream_downstream_diff > svcall.coverage_upstream / svcall.coverage_downstream or svcall.coverage_upstream > svcall.coverage_center): - svcall.filter = "COV_CHANGE_DEL" + svcall.filter = "COV_CHANGE_DUP" return False else: pass @@ -402,7 +402,7 @@ def binomial_coef(n, k): return math.factorial(n) / (math.factorial(k) * math.factorial(n - k)) -def genotype_sv(svcall: SVCall, config, phase): +def genotype_sv(svcall: SVCall, config, phase: tuple | None = None): from sniffles.genotyping import GENOTYPER_BY_TYPE, Genotyper GENOTYPER_BY_TYPE.get(svcall.svtype, Genotyper)(svcall, config, phase).calculate() diff --git a/src/sniffles/snf.py b/src/sniffles/snf.py index ff698e6..d3c2755 100644 --- a/src/sniffles/snf.py +++ b/src/sniffles/snf.py @@ -8,18 +8,22 @@ # Maintainer: Hermann Romanek # Contact: sniffles@romanek.at # +import logging import os import pickle import json import gzip import math -from collections import OrderedDict -from typing import Optional, Union +from functools import cached_property +from typing import Optional from sniffles import sv from sniffles.config import SnifflesConfig +log = logging.getLogger(__name__) + + class SNFile: header_length: int _header: Optional[dict] @@ -42,6 +46,22 @@ def index(self) -> dict: def header(self) -> dict: return self._header + @cached_property + def reqc(self) -> bool: + """ + Was this file created by an old Version of sniffles we want to redo qc? + """ + if self.config.reqc == 'auto': + try: + build, _, _ = self.header['config']['build'].partition('-') + except (KeyError, AttributeError): + log.warning(f'Unable to determine version of SNF file {self.filename} for auto-reqc') + return True + else: + return build < '2.5.3' + else: + return self.config.reqc + def is_open(self) -> bool: return self.handle is not False diff --git a/src/sniffles/sniffles b/src/sniffles/sniffles index 0bc336b..86bd2d3 100755 --- a/src/sniffles/sniffles +++ b/src/sniffles/sniffles @@ -362,6 +362,7 @@ def Sniffles2_Main(processes: list[parallel.SnifflesWorker]): else: util.fatal_error_main("Failed to determine .snf files to be combined. Please specify either one or more .snf files OR a single .tsv file as input for multi-calling.") + log.info("The following samples will be processed in multi-calling:") for snf_internal_id, (input_filename, sample_id) in enumerate(input_snfs_sample_ids): snf_in = snf.SNFile(config, open(input_filename, "rb"), filename=input_filename) snf_in.read_header() @@ -378,7 +379,9 @@ def Sniffles2_Main(processes: list[parallel.SnifflesWorker]): else: sample_id, _ = os.path.splitext(os.path.basename(input_filename)) config.snf_input_info.append({"internal_id": snf_internal_id, "sample_id": sample_id, "filename": input_filename}) + reqc_info = f' (Rerunning QC)' if snf_in.reqc else '' snf_in.close() + log.info(f" {input_filename} (sample ID in output VCF='{sample_id}'{reqc_info})") if not config.combine_consensus: for info in config.snf_input_info: @@ -422,9 +425,6 @@ def Sniffles2_Main(processes: list[parallel.SnifflesWorker]): task_id = tasks[-1].id + 1 log.info(f"Verified headers for {len(input_snfs_sample_ids)} .snf files.") - log.info("The following samples will be processed in multi-calling:") - for info in config.snf_input_info: - log.info(f" {info['filename']} (sample ID in output VCF='{info['sample_id']}')") if config.mode != "genotype_vcf" and config.vcf is not None: vcf_out.write_header(contig_lengths) diff --git a/src/sniffles/sv.py b/src/sniffles/sv.py index 00a4906..99e1e92 100755 --- a/src/sniffles/sv.py +++ b/src/sniffles/sv.py @@ -203,13 +203,13 @@ def call(self, config, task) -> Optional[SVCall]: total_count >= config.combine_low_confidence_abs)) if not qc: - if not config.no_qc and n_samples == 1: + if config.no_qc and n_samples == 1: pass else: return None if not config.combine_output_filtered and not any(cand.qc and cand.filter == "PASS" for cand in self.candidates): - if not config.no_qc and n_samples == 1: + if config.no_qc and n_samples == 1: pass else: return None @@ -263,7 +263,7 @@ def call(self, config, task) -> Optional[SVCall]: genotypes = {0: (cons_a, cons_b, int(sum(consensus_info["qual"]) / consensus_info["count"]), sum(consensus_info["dr"]), sum(consensus_info["dv"]))} if cons_a != 1 and cons_b != 1: - if not config.no_qc and n_samples == 1: + if config.no_qc and n_samples == 1: pass else: return None