From 31e50f5197aa2f2bbef6eb82c3f8564eddbfdbce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 16:56:50 +0100 Subject: [PATCH 01/25] Add option to write proteins sequences from genes in pangenome --- ppanggolin/cluster/cluster.py | 14 +---- ppanggolin/formats/writeSequences.py | 86 ++++++++++++++++++++++++---- 2 files changed, 78 insertions(+), 22 deletions(-) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 14418e2a..9ce34b2a 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -23,7 +23,7 @@ from ppanggolin.utils import read_compressed_or_not, restricted_float from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file -from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations +from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes from ppanggolin.utils import mk_outdir @@ -68,7 +68,6 @@ def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force "or provide a way to access the gene sequence during the annotation step " "(having the fasta in the gff files, or providing the fasta files through the --fasta option)") - def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8, identity: float = 0.8, mode: int = 1) -> Tuple[Path, Path]: """ @@ -84,16 +83,7 @@ def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = :return: path to representative sequence file and path to tsv clustering result """ - seq_nucdb = tmpdir / 'nucleotid_sequences_db' - cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - logging.getLogger("PPanGGOLiN").info("Creating sequence database...") - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") - seqdb = tmpdir / 'aa_db' - cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + seqdb = translate_genes(sequences, tmpdir, cpu, code) logging.getLogger("PPanGGOLiN").info("Clustering sequences...") cludb = tmpdir / 'cluster_db' cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id", diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 900e3496..63a78231 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -5,8 +5,11 @@ import argparse import logging import re +import subprocess from pathlib import Path from typing import TextIO, Dict, Set, Iterable +import tempfile +import shutil # installed libraries from tqdm import tqdm @@ -18,6 +21,7 @@ from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file + module_regex = re.compile(r'^module_\d+') #\d == [0-9] poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id." @@ -43,6 +47,20 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o file_obj.flush() +def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path: + seq_nucdb = tmpdir / 'nucleotid_sequences_db' + cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb])) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").info("Creating sequence database...") + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") + seqdb = tmpdir / 'aa_db' + cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + return seqdb + + def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_core: float = 0.95, compress: bool = False, disable_bar: bool = False): """ @@ -78,6 +96,40 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") +def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: str, soft_core: float = 0.95, + compress: bool = False, keep_tmp: bool = False, tmp: Path = None, + cpu: int = 1, code: int = 11, disable_bar: bool = False): + """ Write all amino acid sequences from given genes in pangenome + + :param pangenome: Pangenome object with gene families sequences + :param output: Path to output directory + :param genes_prot: Selected partition of gene + :param soft_core: Soft core threshold to use + :param compress: Compress the file in .gz + :param keep_tmp: Keep temporary directory + :param tmp: Path to temporary directory + :param cpu: Number of CPU available + :param code: Genetic code use to translate nucleotide sequences to protein sequences + :param disable_bar: Disable progress bar + """ + tmpdir = tmp/"translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes") + mk_outdir(tmpdir, True, True) + + write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar) + + with open(tmpdir/f"{genes_prot}_genes.fna") as sequences: + translate_db = translate_genes(sequences, tmpdir, cpu, code) + outpath = output/f"{genes_prot}_protein_genes.fna" + cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") + + if not keep_tmp: + logging.getLogger("PPanGGOLiN").debug("Clean temporary directory") + shutil.rmtree(tmpdir) + + def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]: """ function used to filter down families to the given partition @@ -93,13 +145,13 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c if partition == 'all': logging.getLogger("PPanGGOLiN").info(f"Writing all of the {type_name}...") genefams = pangenome.gene_families - + elif partition in ['persistent', 'shell', 'cloud']: logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} of the {partition}...") for fam in pangenome.gene_families: if fam.named_partition == partition: genefams.add(fam) - + elif partition == "rgp": logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} in RGPs...") for region in pangenome.regions: @@ -342,8 +394,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, - soft_core: float = 0.95, regions: str = None, genes: str = None, gene_families: str = None, - prot_families: str = None, compress: bool = False, disable_bar: bool = False): + soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None, + gene_families: str = None, prot_families: str = None, compress: bool = False, disable_bar: bool = False): """ Main function to write sequence file from pangenome @@ -359,7 +411,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, :param compress: Compress the file in .gz :param disable_bar: Disable progress bar """ - if not any(x for x in [regions, genes, prot_families, gene_families]): + if not any(x for x in [regions, genes, genes_prot, prot_families, gene_families]): raise Exception("You did not indicate what file you wanted to write.") need_annotations = False @@ -375,7 +427,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, if prot_families in ["core", "softcore"]: need_annotations = True - if any(x is not None for x in [regions, genes, gene_families]): + if any(x is not None for x in [regions, genes, genes_prot, gene_families]): need_annotations = True need_families = True if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)): @@ -394,6 +446,8 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, provided_filter = '' if genes is not None: provided_filter = genes + if genes_prot is not None: + provided_filter = genes_prot if gene_families is not None: provided_filter = gene_families if prot_families is not None: @@ -422,6 +476,9 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar) if genes is not None: write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar) + if genes_prot is not None: + write_gene_protein_sequences(pangenome, output, genes_prot, soft_core, compress, + disable_bar=disable_bar) if regions is not None: write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) @@ -439,7 +496,7 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, - regions=args.regions, genes=args.genes, gene_families=args.gene_families, + regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar) @@ -477,7 +534,7 @@ def parser_seq(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - + required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") @@ -493,7 +550,7 @@ def parser_seq(parser: argparse.ArgumentParser): help="A tab-separated file listing the genome names, and the gff/gbff filepath of its " "annotations (the files can be compressed with gzip). One line per genome. " "If this is provided, those annotations will be used.") - + onereq = parser.add_argument_group(title="Output file", description="At least one of the following argument is required. " "Indicating 'all' writes all elements. Writing a partition " @@ -502,11 +559,13 @@ def parser_seq(parser: argparse.ArgumentParser): ) onereq.add_argument("--genes", required=False, type=filter_values, help=f"Write all nucleotide CDS sequences. {poss_values_log}") + onereq.add_argument("--genes_prot", required=False, type=filter_values, + help=f"Write representative amino acid sequences of genes. {poss_values_log}") onereq.add_argument("--prot_families", required=False, type=filter_values, help=f"Write representative amino acid sequences of gene families. {poss_values_log}") onereq.add_argument("--gene_families", required=False, type=filter_values, help=f"Write representative nucleotide sequences of gene families. {poss_values_log}") - + optional = parser.add_argument_group(title="Optional arguments") # could make choice to allow customization optional.add_argument("--regions", required=False, type=str, choices=["all", "complete"], @@ -515,6 +574,13 @@ def parser_seq(parser: argparse.ArgumentParser): optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, help="Soft core threshold to use if 'softcore' partition is chosen") optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") + optional.add_argument("--translation_table", required=False, default="11", + help="Translation table (genetic code) to use.") + optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + help="directory for storing temporary files") + optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", + help="Keeping temporary files (useful for debugging).") if __name__ == '__main__': From e54565c9adbdc46383b03c78fb9144a0271ac20f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 17:25:52 +0100 Subject: [PATCH 02/25] Split function to add check function --- ppanggolin/formats/writeSequences.py | 155 ++++++++++++++++----------- 1 file changed, 93 insertions(+), 62 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 63a78231..a3753d46 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -27,6 +27,94 @@ poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id." +def check_write_sequences_args(args: argparse.Namespace) -> None: + """Check arguments compatibility in CLI + + :param args: argparse namespace arguments from CLI + + :raises argparse.ArgumentTypeError: if region is given but neither fasta or anno is given + """ + if args.regions is not None and args.fasta is None and args.anno is None: + raise argparse.ArgumentError(argument=None, + message="The --regions options requires the use of --anno or --fasta " + "(You need to provide the same file used to compute the pangenome)") + + +def check_pangenome_to_write_sequences(pangenome: Pangenome, regions: str = None, genes: str = None, + genes_prot: str = None, gene_families: str = None, + prot_families: str = None, disable_bar: bool = False): + """Check and load required information from pangenome + + :param pangenome: Empty pangenome + :param regions: Check and load the RGP + :param genes: Check and load the genes + :param genes_prot: Write amino acid CDS sequences. + :param gene_families: Check and load the gene families to write representative nucleotide sequences. + :param prot_families: Check and load the gene families to write representative amino acid sequences. + :param disable_bar: Disable progress bar + + :raises AssertionError: if not any arguments to write any file is given + :raises ValueError: if the given filter is not recognized + :raises AttributeError: if the pangenome does not contain the required information + """ + if not any(x for x in [regions, genes, genes_prot, prot_families, gene_families]): + raise AssertionError("You did not indicate what file you wanted to write.") + + need_annotations = False + need_families = False + need_graph = False + need_partitions = False + need_spots = False + need_regions = False + need_modules = False + + if prot_families is not None: + need_families = True + if prot_families in ["core", "softcore"]: + need_annotations = True + + if any(x is not None for x in [regions, genes, genes_prot, gene_families]): + need_annotations = True + need_families = True + if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)): + need_annotations = True + need_regions = True + if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)): + need_partitions = True + for x in (genes, gene_families, prot_families): + if x is not None and 'module_' in x: + need_modules = True + + if not (need_annotations or need_families or need_graph or need_partitions or + need_spots or need_regions or need_modules): + # then nothing is needed, then something is wrong. + # find which filter was provided + provided_filter = '' + if genes is not None: + provided_filter = genes + if genes_prot is not None: + provided_filter = genes_prot + if gene_families is not None: + provided_filter = gene_families + if prot_families is not None: + provided_filter = prot_families + if regions is not None: + provided_filter = regions + raise ValueError(f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. " + f"{poss_values_log}") + + if pangenome.status["geneSequences"] not in ["inFile"] and (genes or gene_families): + raise AttributeError("The provided pangenome has no gene sequences. " + "This is not compatible with any of the following options : --genes, --gene_families") + if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"] and prot_families: + raise AttributeError("The provided pangenome has no gene families. This is not compatible with any of " + "the following options : --prot_families, --gene_families") + + check_pangenome_info(pangenome, need_annotations=need_annotations, need_families=need_families, + need_graph=need_graph, need_partitions=need_partitions, need_rgp=need_regions, + need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar) + + def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_obj: TextIO, add: str = '', disable_bar: bool = False): """ @@ -46,7 +134,6 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o file_obj.write(f'{gene.dna}\n') file_obj.flush() - def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path: seq_nucdb = tmpdir / 'nucleotid_sequences_db' cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb])) @@ -395,7 +482,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None, - gene_families: str = None, prot_families: str = None, compress: bool = False, disable_bar: bool = False): + gene_families: str = None, prot_families: str = None, compress: bool = False, + disable_bar: bool = False): """ Main function to write sequence file from pangenome @@ -406,69 +494,14 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, :param soft_core: Soft core threshold to use :param regions: Write the RGP nucleotide sequences :param genes: Write all nucleotide CDS sequences + :param genes_prot: Write amino acid CDS sequences. :param gene_families: Write representative nucleotide sequences of gene families. :param prot_families: Write representative amino acid sequences of gene families. :param compress: Compress the file in .gz :param disable_bar: Disable progress bar """ - if not any(x for x in [regions, genes, genes_prot, prot_families, gene_families]): - raise Exception("You did not indicate what file you wanted to write.") - need_annotations = False - need_families = False - need_graph = False - need_partitions = False - need_spots = False - need_regions = False - need_modules = False - - if prot_families is not None: - need_families = True - if prot_families in ["core", "softcore"]: - need_annotations = True - - if any(x is not None for x in [regions, genes, genes_prot, gene_families]): - need_annotations = True - need_families = True - if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)): - need_annotations = True - need_regions = True - if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)): - need_partitions = True - for x in (genes, gene_families, prot_families): - if x is not None and 'module_' in x: - need_modules = True - - if not (need_annotations or need_families or need_graph or need_partitions or - need_spots or need_regions or need_modules): - # then nothing is needed, then something is wrong. - # find which filter was provided - provided_filter = '' - if genes is not None: - provided_filter = genes - if genes_prot is not None: - provided_filter = genes_prot - if gene_families is not None: - provided_filter = gene_families - if prot_families is not None: - provided_filter = prot_families - if regions is not None: - provided_filter = regions - raise Exception(f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. " - f"{poss_values_log}") - ex_gene_sequences = Exception("The provided pangenome has no gene sequences. " - "This is not compatible with any of the following options : --genes, --gene_families") - ex_gene_family_sequences = Exception("The provided pangenome has no gene families. " - "This is not compatible with any of the following options : " - "--prot_families, --gene_families") - if pangenome.status["geneSequences"] not in ["inFile"] and (genes or gene_families): - raise ex_gene_sequences - if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"] and prot_families: - raise ex_gene_family_sequences - - check_pangenome_info(pangenome, need_annotations=need_annotations, need_families=need_families, - need_graph=need_graph, need_partitions=need_partitions, need_rgp=need_regions, - need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar) + check_pangenome_to_write_sequences(pangenome, regions, genes, genes_prot, gene_families, prot_families, disable_bar) if prot_families is not None: write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar) @@ -489,9 +522,7 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ - if args.regions is not None and args.fasta is None and args.anno is None: - raise Exception("The --regions options requires the use of --anno or --fasta " - "(You need to provide the same file used to compute the pangenome)") + check_write_sequences_args(args) mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) From 035330ecaafe34544579d69cb03e6d5e0c17d96f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 17:34:49 +0100 Subject: [PATCH 03/25] Little refactoring --- ppanggolin/formats/writeSequences.py | 47 +++++++++++++++++++--------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index a3753d46..c68eac5e 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -18,11 +18,11 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Organism -from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype +from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, \ + detect_filetype from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file - -module_regex = re.compile(r'^module_\d+') #\d == [0-9] +module_regex = re.compile(r'^module_\d+') # \d == [0-9] poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id." @@ -134,8 +134,18 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o file_obj.write(f'{gene.dna}\n') file_obj.flush() + def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path: - seq_nucdb = tmpdir / 'nucleotid_sequences_db' + """Translate nucleotide sequences into MMSeqs2 amino acid sequences database + + :param sequences: File with the nucleotide sequences + :param tmpdir: Temporary directory to save the MMSeqs2 files + :param cpu: Number of available CPU cores to use + :param code: Translation code to use + + :return: Path to the MMSeqs2 database + """ + seq_nucdb = tmpdir / 'nucleotide_sequences_db' cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) logging.getLogger("PPanGGOLiN").info("Creating sequence database...") @@ -159,6 +169,8 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co :param soft_core: Soft core threshold to use :param compress: Compress the file in .gz :param disable_bar: Disable progress bar + + :raises AttributeError: If the pangenome does not contain gene sequences """ logging.getLogger("PPanGGOLiN").info("Writing all the gene nucleotide sequences...") @@ -179,7 +191,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co write_gene_sequences_from_annotations(genes_to_write, fasta, disable_bar=disable_bar) else: # this should never happen if the pangenome has been properly checked before launching this function. - raise Exception("The pangenome does not include gene sequences") + raise AttributeError("The pangenome does not include gene sequences") logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") @@ -199,14 +211,14 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: :param code: Genetic code use to translate nucleotide sequences to protein sequences :param disable_bar: Disable progress bar """ - tmpdir = tmp/"translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes") + tmpdir = tmp / "translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes") mk_outdir(tmpdir, True, True) write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar) - with open(tmpdir/f"{genes_prot}_genes.fna") as sequences: + with open(tmpdir / f"{genes_prot}_genes.fna") as sequences: translate_db = translate_genes(sequences, tmpdir, cpu, code) - outpath = output/f"{genes_prot}_protein_genes.fna" + outpath = output / f"{genes_prot}_protein_genes.fna" cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) @@ -396,6 +408,9 @@ def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]: :param organism: organism object :return: Dictionary with all sequences associated to contig + + :raises TypeError: If the file containing sequences is not recognized + :raises KeyError: If their inconsistency between pangenome contigs and the given contigs """ filetype = detect_filetype(genome_file) if filetype in ["fasta", "gff"]: @@ -403,12 +418,12 @@ def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]: elif filetype == "gbff": contig_to_sequence = read_fasta_gbk(genome_file) else: - raise Exception(f"Unknown filetype detected: '{genome_file}'") + raise TypeError(f"Unknown filetype detected: '{genome_file}'") # check_contig_names if set(contig_to_sequence) != {contig.name for contig in organism.contigs}: - raise Exception(f"Contig name inconsistency detected in genome '{organism.name}' between the " - f"information stored in the pangenome file and the contigs found in '{genome_file}'.") + raise KeyError(f"Contig name inconsistency detected in genome '{organism.name}' between the " + f"information stored in the pangenome file and the contigs found in '{genome_file}'.") return contig_to_sequence @@ -420,7 +435,7 @@ def write_spaced_fasta(sequence: str, space: int = 60) -> str: :param sequence: sequence to write :param space: maximum of size for one line - :return: a sequence of maximum space caracter + :return: a sequence of maximum space character """ seq = "" j = 0 @@ -442,6 +457,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa :param anno: A tab-separated file listing the organism names, and the gff/gbff filepath of its annotations :param compress: Compress the file in .gz :param disable_bar: Disable progress bar + + :raises SyntaxError: if no tabulation are found in list genomes file """ assert fasta is not None or anno is not None, "Write regions requires to use anno or fasta, not any provided" @@ -450,7 +467,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa for line in read_compressed_or_not(organisms_file): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - raise Exception(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") + raise SyntaxError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") org_dict[elements[0]] = Path(elements[1]) if not org_dict[elements[0]].exists(): # Check tsv sanity test if it's not one it's the other org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]]) @@ -527,7 +544,8 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, - regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, gene_families=args.gene_families, + regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, + gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar) @@ -543,6 +561,7 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser parser_seq(parser) return parser + def filter_values(arg_value: str): """ Check filter value to ensure they are in the expected format. From 788cd5cfc471c844dbf3c5452dcc43472acb3076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 17:42:33 +0100 Subject: [PATCH 04/25] Add translate arguments as kwargs --- ppanggolin/formats/writeSequences.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index c68eac5e..379601a1 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -500,7 +500,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None, gene_families: str = None, prot_families: str = None, compress: bool = False, - disable_bar: bool = False): + disable_bar: bool = False, **translate_kwgs): """ Main function to write sequence file from pangenome @@ -528,7 +528,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar) if genes_prot is not None: write_gene_protein_sequences(pangenome, output, genes_prot, soft_core, compress, - disable_bar=disable_bar) + disable_bar=disable_bar, **translate_kwgs) if regions is not None: write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) @@ -540,13 +540,17 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ check_write_sequences_args(args) + translate_kwgs = {"code": args.translation_table, + "cpu": args.cpu, + "tmp": args.tmpdir, + "keep_tmp": args.keep_tmp} mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, - gene_families=args.gene_families, - prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar) + gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, + disable_bar=args.disable_prog_bar, **translate_kwgs) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -627,7 +631,7 @@ def parser_seq(parser: argparse.ArgumentParser): optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") From 9cb7b3155dbce1407288e232854a3904ee70de78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 17:56:18 +0100 Subject: [PATCH 05/25] Replace cpu by threads to be more accurate --- ppanggolin/formats/writeSequences.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 379601a1..9e03f87b 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -32,7 +32,7 @@ def check_write_sequences_args(args: argparse.Namespace) -> None: :param args: argparse namespace arguments from CLI - :raises argparse.ArgumentTypeError: if region is given but neither fasta or anno is given + :raises argparse.ArgumentTypeError: if region is given but neither fasta nor anno is given """ if args.regions is not None and args.fasta is None and args.anno is None: raise argparse.ArgumentError(argument=None, @@ -135,12 +135,12 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o file_obj.flush() -def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path: +def translate_genes(sequences: TextIO, tmpdir: Path, threads: int = 1, code: int = 11) -> Path: """Translate nucleotide sequences into MMSeqs2 amino acid sequences database :param sequences: File with the nucleotide sequences :param tmpdir: Temporary directory to save the MMSeqs2 files - :param cpu: Number of available CPU cores to use + :param threads: Number of available threads to use :param code: Translation code to use :return: Path to the MMSeqs2 database @@ -152,7 +152,7 @@ def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 1 subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") seqdb = tmpdir / 'aa_db' - cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) + cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return seqdb @@ -197,7 +197,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: str, soft_core: float = 0.95, compress: bool = False, keep_tmp: bool = False, tmp: Path = None, - cpu: int = 1, code: int = 11, disable_bar: bool = False): + threads: int = 1, code: int = 11, disable_bar: bool = False): """ Write all amino acid sequences from given genes in pangenome :param pangenome: Pangenome object with gene families sequences @@ -207,7 +207,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: :param compress: Compress the file in .gz :param keep_tmp: Keep temporary directory :param tmp: Path to temporary directory - :param cpu: Number of CPU available + :param threads: Number of threads available :param code: Genetic code use to translate nucleotide sequences to protein sequences :param disable_bar: Disable progress bar """ @@ -217,7 +217,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar) with open(tmpdir / f"{genes_prot}_genes.fna") as sequences: - translate_db = translate_genes(sequences, tmpdir, cpu, code) + translate_db = translate_genes(sequences, tmpdir, threads, code) outpath = output / f"{genes_prot}_protein_genes.fna" cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) @@ -541,7 +541,7 @@ def launch(args: argparse.Namespace): """ check_write_sequences_args(args) translate_kwgs = {"code": args.translation_table, - "cpu": args.cpu, + "threads": args.threads, "tmp": args.tmpdir, "keep_tmp": args.keep_tmp} mk_outdir(args.output, args.force) @@ -630,7 +630,7 @@ def parser_seq(parser: argparse.ArgumentParser): optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + optional.add_argument("--threads", required=False, default=1, type=int, help="Number of available threads") optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", From 12186a5ba59a8239491d7f817e1a2d99fb295b3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 17:56:30 +0100 Subject: [PATCH 06/25] Write test and documentation for the new option --- .github/workflows/main.yml | 1 + docs/user/writeFasta.md | 28 +++++++++++++++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 67c4ad2d..5db304dd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,6 +77,7 @@ jobs: ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0 ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot -c 1 --keep_tmp ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md index ac1ece26..e0ba5013 100644 --- a/docs/user/writeFasta.md +++ b/docs/user/writeFasta.md @@ -18,7 +18,9 @@ When using the `softcore` filter, the `--soft_core` option can be used to modify ## Genes -This option can be used to write the nucleotide CDS sequences. It can be used as such, to write all of the genes of the pangenome for example: +### Nucleotide sequences + +This option can be used to write the nucleotide CDS sequences. It can be used as such, to write all the genes of the pangenome for example: ```bash ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes all @@ -30,8 +32,25 @@ Or to write only the persistent genes: ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent ``` +### Protein sequences + +This option can be used to write the amino acid CDS sequences. It can be used as such, to write all the genes of the pangenome for example: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot all +``` + +Or to write only the cloud genes: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot cloud +``` + +To translate the genes sequences, PPanGGOLiN use [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. So for this option you can give multiple threads with `--threads`. You can also specify the translation table to use with `--translate_table`. Finally, you can keep the [MMSeqs2](https://github.com/soedinglab/MMseqs2) that are generated in the temporary directory (that you can also be specified with `--tmpdir`) by indicate the option `--keep_tmp`. -## Protein families +## Gene families + +### Protein sequences This option can be used to write the protein sequences of the representative sequences for each family. It can be used as such for all families: @@ -39,14 +58,13 @@ This option can be used to write the protein sequences of the representative seq ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families all ``` -or for all of the shell families for example: +or for all the shell families for example: ```bash ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell ``` - -## Gene families +### Nucleotide sequences This option can be used to write the gene sequences of the representative sequences for each family. It can be used as such: From 7febe1bb20c77fe1eab8531e1b1d291f89cf54bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 29 Mar 2024 15:04:59 +0100 Subject: [PATCH 07/25] Fix call new option for GitHub action --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5db304dd..26c79513 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,7 +77,7 @@ jobs: ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0 ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot -c 1 --keep_tmp + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud -c 1 --keep_tmp ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log From ee283cb0a7846f517fc31eaeebe0be1374f0a398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 29 Mar 2024 15:55:52 +0100 Subject: [PATCH 08/25] Fix call new option for GitHub action --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 26c79513..b0ccea1b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,7 +77,7 @@ jobs: ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0 ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud -c 1 --keep_tmp + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud --threads 1 --keep_tmp ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log From 922073d725b9c9d39ec54151d83cd39976fd4f7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 10 Apr 2024 11:01:03 +0200 Subject: [PATCH 09/25] Refactoring --- ppanggolin/annotate/synta.py | 4 ++-- ppanggolin/formats/writeMSA.py | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index d1eaa428..e4cdaf0b 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -301,9 +301,9 @@ def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str: :return: str """ if gene.strand == "+": - return contig_seq[gene.start - 1:gene.stop] + return contig_seq[gene.start - 1: gene.stop] elif gene.strand == "-": - return reverse_complement(contig_seq[gene.start - 1:gene.stop]) + return reverse_complement(contig_seq[gene.start - 1: gene.stop]) def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str], tmpdir: str, diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 0e419205..00f974f2 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -15,6 +15,7 @@ from tqdm import tqdm # local libraries +from ppanggolin.genome import Gene from ppanggolin.geneFamily import GeneFamily from ppanggolin.pangenome import Pangenome from ppanggolin.utils import mk_outdir, restricted_float @@ -95,10 +96,10 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", return families -def translate(seq: str, code: Dict[str, Dict[str, str]]) -> str: +def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> str: """translates the given dna sequence with the given translation table - :param seq: given dna sequence + :param gene: given gene :param code: translation table corresponding to genetic code to use :return: protein sequence @@ -106,17 +107,17 @@ def translate(seq: str, code: Dict[str, Dict[str, str]]) -> str: # code: https://www.bioinformatics.org/sms/iupac.html start_table = code["start_table"] table = code["trans_table"] - - if len(seq) % 3 == 0: - protein = start_table[seq[0: 3]] - for i in range(3, len(seq), 3): - codon = seq[i: i + 3] + if len(gene.dna) % 3 == 0: + protein = start_table[gene.dna[0: 3]] + for i in range(3, len(gene.dna), 3): + codon = gene.dna[i: i + 3] try: protein += table[codon] except KeyError: # codon was not planned for. Probably can't determine it. protein += 'X' # X is for unknown else: - raise IndexError("Given sequence length modulo 3 was different than 0, which is unexpected.") + logging.getLogger("PPANGGOLIN").error(f"Found gene length is {len(gene.dna)}") + raise IndexError(f"Gene {gene.ID} sequence length modulo 3 was different than 0, which is unexpected.") return protein @@ -150,7 +151,7 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory if source == "dna": f_obj.write(gene.dna + '\n') elif source == "protein": - f_obj.write(translate(gene.dna, code_table) + "\n") + f_obj.write(translate(gene, code_table) + "\n") else: raise AssertionError("Unknown sequence source given (expected 'dna' or 'protein')") f_obj.flush() @@ -229,7 +230,7 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: Path """ # sort familes by ID, so the gene order is consistent - families = sorted(families, key = lambda fam: fam.ID) + families = sorted(families, key=lambda f: f.ID) phylo_dict = {} for org in pangenome.organisms: From 9aaac235a22b05a3fd1fa89ccaa2ebbd4870b0ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 10 Apr 2024 14:34:35 +0200 Subject: [PATCH 10/25] Make gene translation more flexible by cutting off the end of genes that are not modulo 3 --- ppanggolin/formats/writeMSA.py | 49 +++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 00f974f2..e1382316 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -96,7 +96,7 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", return families -def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> str: +def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> Tuple[str, bool]: """translates the given dna sequence with the given translation table :param gene: given gene @@ -107,22 +107,25 @@ def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> str: # code: https://www.bioinformatics.org/sms/iupac.html start_table = code["start_table"] table = code["trans_table"] - if len(gene.dna) % 3 == 0: - protein = start_table[gene.dna[0: 3]] - for i in range(3, len(gene.dna), 3): - codon = gene.dna[i: i + 3] - try: - protein += table[codon] - except KeyError: # codon was not planned for. Probably can't determine it. - protein += 'X' # X is for unknown - else: - logging.getLogger("PPANGGOLIN").error(f"Found gene length is {len(gene.dna)}") - raise IndexError(f"Gene {gene.ID} sequence length modulo 3 was different than 0, which is unexpected.") - return protein + mod = len(gene.dna) % 3 + partial = False + if mod != 0: + partial = True + msg = (f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}" + f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.") + logging.getLogger("PPANGGOLIN").debug(msg) + protein = start_table[gene.dna[0: 3]] + for i in range(3, len(gene.dna) - mod, 3): + codon = gene.dna[i: i + 3] + try: + protein += table[codon] + except KeyError: # codon was not planned for. Probably can't determine it. + protein += 'X' # X is for unknown + return protein, partial def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory, code_table: Dict[str, Dict[str, str]], - source: str = 'protein', use_gene_id: bool = False) -> Path: + source: str = 'protein', use_gene_id: bool = False) -> Tuple[Path, bool]: """Write fasta files for each gene family :param family: gene family to write @@ -143,6 +146,7 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory if len(genes) == 1: single_copy_genes.extend(genes) + partial = False for gene in single_copy_genes: if use_gene_id: f_obj.write('>' + gene.ID + "\n") @@ -151,12 +155,14 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory if source == "dna": f_obj.write(gene.dna + '\n') elif source == "protein": - f_obj.write(translate(gene, code_table) + "\n") + protein, part = translate(gene, code_table) + if not partial: + partial = part + f_obj.write(protein + "\n") else: raise AssertionError("Unknown sequence source given (expected 'dna' or 'protein')") f_obj.flush() - - return f_name + return f_name, partial def launch_mafft(fname: Path, output: Path, fam_name: str): @@ -204,12 +210,19 @@ def compute_msa(families: Set[GeneFamily], output: Path, tmpdir: Path, cpu: int logging.getLogger("PPanGGOLiN").info("Preparing input files for MSA...") code_table = genetic_codes(code) + partial = False for family in tqdm(families, unit="family", disable=disable_bar): start_write = time.time() - fname = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id) + fname, part = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id) + if not partial: + partial = part write_total = write_total + (time.time() - start_write) args.append((fname, output, family.name)) + if partial: + logging.getLogger("PPanGGOLiN").warning("Partial gene was found during translation. " + "Last nucleotides were removed to translate. " + "Use --verbose 2 to see genes that are partial") logging.getLogger("PPanGGOLiN").info("Computing the MSA ...") with get_context('fork').Pool(cpu) as p: with tqdm(total=len(families), unit="family", disable=disable_bar) as bar: From 8129bfab155e01e3bcf6e380b84a134f598ef13f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 27 May 2024 11:07:28 +0200 Subject: [PATCH 11/25] Fix unset variable --- ppanggolin/formats/writeMSA.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index d59cac81..b4f35f01 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -111,8 +111,9 @@ def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> Tuple[str, bool]: partial = False if mod != 0: partial = True - msg = (f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}" - f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.") + msg = ( + f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}" + f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.") logging.getLogger("PPANGGOLIN").debug(msg) protein = start_table[gene.dna[0: 3]] for i in range(3, len(gene.dna) - mod, 3): @@ -145,9 +146,9 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory for _, genes in family.get_org_dict().items(): if len(genes) == 1: single_copy_genes.extend(genes) - - with open(f_name, "w") as f_obj: + with open(f_name, "w") as f_obj: + partial = False for gene in single_copy_genes: if use_gene_id: f_obj.write(f">{gene.ID}\n") @@ -156,10 +157,10 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory if source == "dna": f_obj.write(gene.dna + '\n') elif source == "protein": - protein, part = translate(gene, code_table) - if not partial: - partial = part - f_obj.write(protein + "\n") + protein, part = translate(gene, code_table) + if not partial: + partial = part + f_obj.write(protein + "\n") else: raise ValueError(f"Unknown sequence source '{source}' provided. Expected 'dna' or 'protein'.") @@ -179,12 +180,12 @@ def launch_mafft(fname: Path, output: Path, fam_name: str): logging.getLogger("PPanGGOLiN").debug("command: " + " ".join(cmd)) result = subprocess.run(cmd, capture_output=True) - with open(outname, "w") as fout: + with open(outname, "w") as fout: fout.write(result.stdout.decode('utf-8')) if result.stderr and result.returncode != 0: raise Exception("mafft failed with the following error:\n" - f"{result.stderr.decode('utf-8')}") + f"{result.stderr.decode('utf-8')}") def launch_multi_mafft(args: List[Tuple[Path, Path, str]]): From 02b201a6847b6a61ec3d982daceba07997d699f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 4 Jun 2024 15:36:40 +0200 Subject: [PATCH 12/25] Replace translate_with_mmseqs by translate gene --- ppanggolin/align/alignOnPang.py | 104 +++++++++------------------ ppanggolin/cluster/cluster.py | 2 +- ppanggolin/formats/writeSequences.py | 36 +++++++--- 3 files changed, 60 insertions(+), 82 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 247545ae..c4c7bc56 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -9,7 +9,7 @@ import subprocess import argparse from collections import defaultdict, Counter -from typing import List, Tuple, Set, Dict, IO, Iterable +from typing import List, Tuple, Set, Dict, TextIO, Iterable from pathlib import Path from tqdm import tqdm @@ -22,52 +22,10 @@ from ppanggolin.region import Spot from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph from ppanggolin.formats.readBinaries import get_non_redundant_gene_sequences_from_file +from ppanggolin.formats.writeSequences import translate_genes, create_mmseqs_db -def create_mmseqs_db(seq_files: Iterable[Path], tmpdir: Path, basename="sequences") -> Path: - """ - Create a MMseqs2 sequence database with the given fasta files. - - :param seq_files: An iterable of path of FASTA files. - :param tmpdir: Path to the temporary directory where the database will be created. - :param basename: Prefix for the database file (default: "sequences"). - - :return: Path to the created MMseqs2 database file. - """ - - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, suffix=".DB", prefix=basename) as seqdb: - cmd = ["mmseqs", "createdb"] + [seq_file.as_posix() for seq_file in seq_files] + [seqdb.name, '--dbtype', '0'] - - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL) - - return Path(seqdb.name) - - -def translate_with_mmseqs(seqdb: Path, translation_table: int, cpu: int, tmpdir: Path) -> Path: - """ - Translate nucleotide sequences in an MMseqs2 sequence database to amino acid sequences. - - :param seqdb: Path to the input MMseqs2 sequence database containing nucleotide sequences. - :param translation_table: The translation table to use for conversion. - :param cpu: Number of CPU cores to use for translation. - :param tmpdir: Path to the temporary directory for intermediate files. - - :return: Path to the new MMseqs2 sequence database containing translated amino acid sequences. - """ - - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix=seqdb.stem, - suffix=".aa.DB") as seqdb_aa: - cmd = ["mmseqs", "translatenucs", seqdb.as_posix(), seqdb_aa.name, "--translation-table", - f"{translation_table}", "--threads", str(cpu)] - - logging.getLogger().debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - - return Path(seqdb_aa.name) - - -def align_seq_to_pang(target_seq_file: Path, query_seq_files: Iterable[Path], +def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path, tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, is_query_nt: bool = False, is_target_nt: bool = False, translation_table: int = None) -> Path: @@ -85,21 +43,23 @@ def align_seq_to_pang(target_seq_file: Path, query_seq_files: Iterable[Path], :param is_target_nt: Is the sequences of pangenome (target) are nucleotide sequences. If True, sequences are translated by mmseqs :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated. - :return: Alignement result file + :return: Alignment result file """ - target_db = create_mmseqs_db([target_seq_file], tmpdir, basename="target_sequences") - query_db = create_mmseqs_db(query_seq_files, tmpdir, basename="query_sequences") - if is_target_nt: - logging.getLogger().debug( - f"Target sequences will be translated by mmseqs with translation table {translation_table}") - target_db = translate_with_mmseqs(target_db, translation_table, cpu, tmpdir) + logging.getLogger().debug("Target sequences will be translated by mmseqs with " + f"translation table {translation_table}") + target_db = translate_genes(target_seq_file, 'target_db', tmpdir, cpu, translation_table) + else: + target_db = create_mmseqs_db(target_seq_file, 'target_db', tmpdir, db_mode=0, db_type=1) - if is_query_nt: - logging.getLogger().debug( - f"Query sequences will be translated by mmseqs with translation table {translation_table}") - query_db = translate_with_mmseqs(query_db, translation_table, cpu, tmpdir) + with open(query_seq_files, 'r') as query_sequences: + if is_query_nt: + logging.getLogger().debug("Query sequences will be translated by mmseqs " + f"with translation table {translation_table}") + query_db = translate_genes(query_sequences, 'query_db', tmpdir, cpu, translation_table) + else: + query_db = create_mmseqs_db(query_sequences, 'query_db', tmpdir, db_mode=0, db_type=1) cov_mode = "2" # coverage of query if no_defrag: @@ -141,11 +101,11 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path, Read alignment result to link input sequences to pangenome gene family. Alignment have been made against all genes of the pangenome. - :param aln_res: Alignement result file + :param aln_res: Alignment result file :param outdir: Output directory :param pangenome: Input pangenome - :return: Dictionnary with sequence link to pangenome gene families and actual path to the cleaned alignment file + :return: Dictionary with sequence link to pangenome gene families and actual path to the cleaned alignment file """ seq2pang = {} @@ -176,11 +136,11 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path, Read alignment result to link input sequences to pangenome gene family. Alignment have been made against representative sequence of gene families of the pangenome. - :param aln_res: Alignement result file + :param aln_res: Alignment result file :param outdir: Output directory :param pangenome: Input pangenome - :return: Dictionnary with sequence link to pangenome gene families and actual path to the cleaned alignment file + :return: Dictionary with sequence link to pangenome gene families and actual path to the cleaned alignment file """ seq2pang = {} aln_file_clean = outdir / "alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file @@ -231,7 +191,7 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: return seq_set, is_nucleotide -def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False): +def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIO, add: str = "", disable_bar: bool = False): """ Export the sequence of gene families @@ -247,7 +207,7 @@ def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", # file_obj.flush() -def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False): +def write_all_gene_sequences(pangenome: Pangenome, file_obj: TextIO, add: str = "", disable_bar: bool = False): """ Export the sequence of pangenome genes @@ -269,7 +229,7 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq """ Project the partition of each sequence from the input file and write them in a file - :param seqid_to_gene_family: dictionnary which link sequence and pangenome gene family + :param seqid_to_gene_family: dictionary which link sequence and pangenome gene family :param seq_set: input sequences :param output: Path of the output directory @@ -289,7 +249,7 @@ def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_s """ Write input gene to pangenome gene family. - :param seqid_to_gene_family: dictionnary which links input sequence and pangenome gene family + :param seqid_to_gene_family: dictionary which links input sequence and pangenome gene family :param seq_set: input sequences :param output: Path of the output directory @@ -317,7 +277,7 @@ def get_fam_to_rgp(pangenome, multigenics: set) -> dict: :param pangenome: Input pangenome :param multigenics: multigenics families - :return: Dictionnary link families to RGP + :return: Dictionary link families to RGP """ fam2rgp = defaultdict(list) for rgp in pangenome.regions: @@ -376,7 +336,7 @@ def draw_spot_gexf(spots: set, output: Path, multigenics: set, fam_to_mod: dict, :param spots: spot find in the alignment between pangenome and input sequences :param output: Path of the output directory :param multigenics: multigenics families - :param fam_to_mod: dictionnary which link families and modules + :param fam_to_mod: dictionary which link families and modules :param set_size: """ for spot in spots: @@ -433,7 +393,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel f"{output / 'info_input_seq.tsv'}") -def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, +def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path, output: Path, tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: @@ -465,7 +425,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Itera logging.getLogger().debug(f'Write gene family sequences in {tmp_pang_file.name}') write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) - align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files, + align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files, tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, is_query_nt=is_input_seq_nt, is_target_nt=False, @@ -476,7 +436,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Itera return align_file, seq2pang -def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, +def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, output: Path, tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: @@ -507,7 +467,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Itera logging.getLogger().debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}') write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) - align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files, + align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files, tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, is_query_nt=is_input_seq_nt, is_target_nt=True, @@ -564,14 +524,14 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: - align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir, + align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, is_input_seq_nt=is_nucleotide, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) else: - align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=[sequence_file], + align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file, output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, cpu=cpu, no_defrag=no_defrag, identity=identity, diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 69d69f61..1f192370 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -85,7 +85,7 @@ def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = :return: path to representative sequence file and path to tsv clustering result """ - seqdb = translate_genes(sequences, tmpdir, cpu, code) + seqdb = translate_genes(sequences, 'aa_db', tmpdir, cpu, code) logging.getLogger("PPanGGOLiN").info("Clustering sequences...") cludb = tmpdir / 'cluster_db' cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id", diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index b47b5581..92886427 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -134,7 +134,29 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o file_obj.flush() -def translate_genes(sequences: TextIO, tmpdir: Path, threads: int = 1, code: int = 11) -> Path: +def create_mmseqs_db(sequences: TextIO, db_name: str, tmpdir: Path, db_mode: int = 0, db_type: int = 0) -> Path: + """Create a MMseqs2 database from a sequences file. + + :param sequences: File with the sequences + :param tmpdir: Temporary directory to save the MMSeqs2 files + :param db_mode: Createdb mode 0: copy data, 1: soft link data and write new index (works only with single line fasta/q) + :param db_type: Database type 0: auto, 1: amino acid 2: nucleotides + + :return: Path to the MMSeqs2 database + """ + assert db_mode in [0, 1], f"Createdb mode must be 0 or 1, given {db_mode}" + assert db_type in [0, 1, 2], f"dbtype must be 0, 1 or 2, given {db_type}" + + seq_nucdb = tmpdir / db_name + cmd = list(map(str, ["mmseqs", "createdb", "--createdb-mode", db_mode, + "--dbtype", db_type, sequences.name, seq_nucdb])) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").info("Creating sequence database...") + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + return seq_nucdb + + +def translate_genes(sequences: TextIO, db_name: str, tmpdir: Path, threads: int = 1, code: int = 11) -> Path: """Translate nucleotide sequences into MMSeqs2 amino acid sequences database :param sequences: File with the nucleotide sequences @@ -144,13 +166,9 @@ def translate_genes(sequences: TextIO, tmpdir: Path, threads: int = 1, code: int :return: Path to the MMSeqs2 database """ - seq_nucdb = tmpdir / 'nucleotide_sequences_db' - cmd = list(map(str, ["mmseqs", "createdb", "--createdb-mode", 1, sequences.name, seq_nucdb])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - logging.getLogger("PPanGGOLiN").info("Creating sequence database...") - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + seq_nucdb = create_mmseqs_db(sequences, 'nucleotide_sequences_db', tmpdir) logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") - seqdb = tmpdir / 'aa_db' + seqdb = tmpdir / db_name cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) @@ -222,7 +240,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar) with open(tmpdir / f"{genes_prot}_genes.fna") as sequences: - translate_db = translate_genes(sequences, tmpdir, threads, code) + translate_db = translate_genes(sequences, 'aa_db', tmpdir, threads, code) outpath = output / f"{genes_prot}_protein_genes.fna" cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) @@ -619,7 +637,7 @@ def parser_seq(parser: argparse.ArgumentParser): ) onereq.add_argument("--genes", required=False, type=filter_values, help=f"Write all nucleotide CDS sequences. {poss_values_log}") - onereq.add_argument("--genes_prot", required=False, type=filter_values, + onereq.add_argument("--proteins", required=False, type=filter_values, help=f"Write representative amino acid sequences of genes. {poss_values_log}") onereq.add_argument("--prot_families", required=False, type=filter_values, help=f"Write representative amino acid sequences of gene families. {poss_values_log}") From 4cb53d158f8148447f4aa91f4f49d10a3cb3f559 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 4 Jun 2024 16:28:27 +0200 Subject: [PATCH 13/25] Add a soft link in mmseqs createdb --- ppanggolin/align/alignOnPang.py | 103 +++++++++++++++------------ ppanggolin/cluster/cluster.py | 2 +- ppanggolin/formats/writeSequences.py | 13 ++-- 3 files changed, 69 insertions(+), 49 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index c4c7bc56..a59739d8 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -9,7 +9,7 @@ import subprocess import argparse from collections import defaultdict, Counter -from typing import List, Tuple, Set, Dict, TextIO, Iterable +from typing import List, Tuple, Set, Dict, TextIO from pathlib import Path from tqdm import tqdm @@ -25,10 +25,10 @@ from ppanggolin.formats.writeSequences import translate_genes, create_mmseqs_db -def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path, - tmpdir: Path, cpu: int = 1, no_defrag: bool = False, - identity: float = 0.8, coverage: float = 0.8, - is_query_nt: bool = False, is_target_nt: bool = False, translation_table: int = None) -> Path: +def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path, tmpdir: Path, cpu: int = 1, + no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, + is_query_nt: bool = False, is_query_slf: bool = False, is_target_nt: bool = False, + is_target_slf: bool = False, translation_table: int = None) -> Path: """ Align fasta sequence to pangenome sequences. @@ -40,26 +40,30 @@ def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path, :param identity: minimal identity threshold for the alignment :param coverage: minimal identity threshold for the alignment :param is_query_nt: Is the sequence file (query) are nucleotide sequences. If True, sequences are translated by mmseqs + :param is_query_slf: Is the sequence file (query) with single line fasta. If True, MMSeqs2 database will be with soft link :param is_target_nt: Is the sequences of pangenome (target) are nucleotide sequences. If True, sequences are translated by mmseqs + :param is_target_slf: Is the sequences of pangenome (target) with single line fasta. If True, MMSeqs2 database will be with soft link :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated. :return: Alignment result file """ if is_target_nt: - logging.getLogger().debug("Target sequences will be translated by mmseqs with " - f"translation table {translation_table}") - target_db = translate_genes(target_seq_file, 'target_db', tmpdir, cpu, translation_table) + logging.getLogger("PPanGGOLiN").debug("Target sequences will be translated by mmseqs with " + f"translation table {translation_table}") + target_db = translate_genes(target_seq_file, 'target_db', tmpdir, cpu, is_target_slf, translation_table) else: - target_db = create_mmseqs_db(target_seq_file, 'target_db', tmpdir, db_mode=0, db_type=1) + target_db = create_mmseqs_db(target_seq_file, 'target_db', tmpdir, + db_mode=1 if is_target_slf else 0, db_type=1) with open(query_seq_files, 'r') as query_sequences: if is_query_nt: - logging.getLogger().debug("Query sequences will be translated by mmseqs " - f"with translation table {translation_table}") - query_db = translate_genes(query_sequences, 'query_db', tmpdir, cpu, translation_table) + logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs " + f"with translation table {translation_table}") + query_db = translate_genes(query_sequences, 'query_db', tmpdir, cpu, is_query_slf, translation_table) else: - query_db = create_mmseqs_db(query_sequences, 'query_db', tmpdir, db_mode=0, db_type=1) + query_db = create_mmseqs_db(query_sequences, 'query_db', tmpdir, + db_mode=1 if is_query_slf else 0, db_type=1) cov_mode = "2" # coverage of query if no_defrag: @@ -75,21 +79,21 @@ def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path, "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), "--seed-sub-mat", "VTML40.out", "-s", "2", '--comp-bias-corr', "0", "--mask", "0", "-e", "1"] - logging.getLogger().info("Aligning sequences") - logging.getLogger().debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").info("Aligning sequences") + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) start = time.time() subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) align_time = time.time() - start - logging.getLogger().info(f"Done aligning sequences in {round(align_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Done aligning sequences in {round(align_time, 2)} seconds") with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix=".tsv", delete=False) as outfile: cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name, "--format-mode", "2"] - logging.getLogger().info("Extracting alignments...") - logging.getLogger().debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").info("Extracting alignments...") + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return Path(outfile.name) @@ -110,7 +114,7 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path, seq2pang = {} aln_file_clean = outdir / "alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file - logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') + logging.getLogger("PPanGGOLiN").debug(f'Writing alignment file in {aln_file_clean}') with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: @@ -145,7 +149,7 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path, seq2pang = {} aln_file_clean = outdir / "alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file - logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') + logging.getLogger("PPanGGOLiN").debug(f'Writing alignment file in {aln_file_clean}') with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: @@ -165,7 +169,7 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path, return seq2pang, aln_file_clean -def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: +def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool, bool]: """ Get sequence IDs from a sequence input file in FASTA format and guess the sequence type based on the first sequences. @@ -177,18 +181,23 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: seq_set = set() seq_count = 0 first_seq_concat = "" - + single_line_fasta = False + count_fasta_line = 0 for line in seq_file: if line.startswith(">"): seq_set.add(line[1:].split()[0].strip()) seq_count += 1 + if count_fasta_line > 1: # Allow to know if we can use soft link with createdb from MMSeqs2 + single_line_fasta = True + count_fasta_line = 0 elif seq_count <= 20: first_seq_concat += line.strip() + count_fasta_line += 1 char_counter = Counter(first_seq_concat) is_nucleotide = all(char in dna_expected_char for char in char_counter) - return seq_set, is_nucleotide + return seq_set, is_nucleotide, single_line_fasta def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIO, add: str = "", disable_bar: bool = False): @@ -394,7 +403,8 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path, output: Path, - tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False, + tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, + cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: """ @@ -407,7 +417,8 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path, :param sequence_files: Iterable of paths of FASTA files containing input sequences to align. :param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. - :param is_input_seq_nt: Is input sequence file nucleotide sequences. + :param is_input_seq_nt: Is input sequence file nucleotide sequences. + :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link :param cpu: Number of CPU cores to use for the alignment (default: 1). :param no_defrag: If True, the defragmentation workflow is skipped (default: False). :param identity: Minimum identity threshold for the alignment (default: 0.8). @@ -422,23 +433,24 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path, # delete False to be able to keep tmp file. If they are not keep tmpdir will be destroyed so no need to delete tmpfile with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, prefix="representative_genes", suffix=".faa") as tmp_pang_file: - logging.getLogger().debug(f'Write gene family sequences in {tmp_pang_file.name}') + logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {tmp_pang_file.name}') write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files, tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_target_nt=False, - translation_table=translation_table) + is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, + is_target_nt=False, is_target_slf=False, translation_table=translation_table) seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) return align_file, seq2pang -def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, output: Path, - tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False, - identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, +def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, output: Path, tmpdir: Path, + is_input_seq_nt: bool, is_input_slf: bool = False, + cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, + coverage: float = 0.8, translation_table: int = 11, disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: """ Assign gene families from a pangenome to input sequences. @@ -450,7 +462,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, :param sequence_files: Iterable of paths of FASTA files containing input sequences to align. :param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. - :param is_input_seq_nt: Is input sequence file nucleotide sequences. + :param is_input_seq_nt: Is input sequence file nucleotide sequences. + :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link :param cpu: Number of CPU cores to use for the alignment (default: 1). :param no_defrag: If True, the defragmentation workflow is skipped (default: False). :param identity: Minimum identity threshold for the alignment (default: 0.8). @@ -464,14 +477,13 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file: - logging.getLogger().debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}') + logging.getLogger("PPanGGOLiN").debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}') write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) - align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files, - tmpdir=tmpdir, cpu=cpu, - no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_target_nt=True, - translation_table=translation_table) + align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files, tmpdir=tmpdir, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True, + is_target_slf=True, translation_table=translation_table) seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) @@ -480,7 +492,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8, coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, - use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11, + use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11, tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False): """ Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2. @@ -519,13 +531,14 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo check_pangenome_info(pangenome, need_families=True, disable_bar=disable_bar) with read_compressed_or_not(sequence_file) as seqFileObj: - seq_set, is_nucleotide = get_seq_ids(seqFileObj) + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(seqFileObj) with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, is_input_seq_nt=is_nucleotide, + is_input_slf=single_line_fasta, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, @@ -534,6 +547,7 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file, output=output, tmpdir=new_tmpdir, is_input_seq_nt=is_nucleotide, + is_input_slf=single_line_fasta, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, @@ -543,9 +557,10 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) part_proj = project_and_write_partition(seq2pang, seq_set, output) # write the partition assignation only - logging.getLogger().info(f"sequences partition projection : '{part_proj}'") - logging.getLogger().info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") - logging.getLogger().info(f"Blast-tab file of the alignment : '{align_file}'") + logging.getLogger("PPanGGOLiN").info(f"sequences partition projection : '{part_proj}'") + logging.getLogger("PPanGGOLiN").info( + f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") + logging.getLogger("PPanGGOLiN").info(f"Blast-tab file of the alignment : '{align_file}'") def launch(args: argparse.Namespace): @@ -617,7 +632,7 @@ def parser_align(parser: argparse.ArgumentParser): help="In the context of provided annotation, use this option to read pseudogenes. " "(Default behavior is to ignore them)") optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 1f192370..85d99ebc 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -85,7 +85,7 @@ def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = :return: path to representative sequence file and path to tsv clustering result """ - seqdb = translate_genes(sequences, 'aa_db', tmpdir, cpu, code) + seqdb = translate_genes(sequences, 'aa_db', tmpdir, cpu, True, code) logging.getLogger("PPanGGOLiN").info("Clustering sequences...") cludb = tmpdir / 'cluster_db' cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id", diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 92886427..24d1ab76 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -138,6 +138,7 @@ def create_mmseqs_db(sequences: TextIO, db_name: str, tmpdir: Path, db_mode: int """Create a MMseqs2 database from a sequences file. :param sequences: File with the sequences + :param db_name: name of the database :param tmpdir: Temporary directory to save the MMSeqs2 files :param db_mode: Createdb mode 0: copy data, 1: soft link data and write new index (works only with single line fasta/q) :param db_type: Database type 0: auto, 1: amino acid 2: nucleotides @@ -156,17 +157,21 @@ def create_mmseqs_db(sequences: TextIO, db_name: str, tmpdir: Path, db_mode: int return seq_nucdb -def translate_genes(sequences: TextIO, db_name: str, tmpdir: Path, threads: int = 1, code: int = 11) -> Path: +def translate_genes(sequences: TextIO, db_name: str, tmpdir: Path, threads: int = 1, + is_single_line_fasta: bool = False, code: int = 11) -> Path: """Translate nucleotide sequences into MMSeqs2 amino acid sequences database :param sequences: File with the nucleotide sequences + :param db_name: name of the output database :param tmpdir: Temporary directory to save the MMSeqs2 files :param threads: Number of available threads to use + :param is_single_line_fasta: Allow to use soft link in MMSeqs2 database :param code: Translation code to use :return: Path to the MMSeqs2 database """ - seq_nucdb = create_mmseqs_db(sequences, 'nucleotide_sequences_db', tmpdir) + seq_nucdb = create_mmseqs_db(sequences, 'nucleotide_sequences_db', tmpdir, + db_mode=1 if is_single_line_fasta else 0, db_type=2) logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") seqdb = tmpdir / db_name cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code])) @@ -196,7 +201,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co genefams = set() genes_to_write = [] if genes == "rgp": - logging.getLogger("PPanGGOLiN").info(f"Writing the gene nucleotide sequences in RGPs...") + logging.getLogger("PPanGGOLiN").info("Writing the gene nucleotide sequences in RGPs...") for region in pangenome.regions: genes_to_write.extend(region.genes) else: @@ -240,7 +245,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar) with open(tmpdir / f"{genes_prot}_genes.fna") as sequences: - translate_db = translate_genes(sequences, 'aa_db', tmpdir, threads, code) + translate_db = translate_genes(sequences, 'aa_db', tmpdir, threads, True, code) outpath = output / f"{genes_prot}_protein_genes.fna" cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) From 326403f3a4178960bb7be5a052bc4667069f82da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 4 Jun 2024 16:36:35 +0200 Subject: [PATCH 14/25] Replace add_spot_str function by Spot class method __str__ --- ppanggolin/align/alignOnPang.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index a59739d8..9eef70a6 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -326,18 +326,6 @@ def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) \ return fam2spot, fam2border -def add_spot_str(spot: Spot) -> str: - # TODO define as self.__str__ in spot - """ - allow to map spot set - - :param spot: spot which will be return - - :return: Str with spot ID - """ - return "spot_" + str(spot.ID) - - def draw_spot_gexf(spots: set, output: Path, multigenics: set, fam_to_mod: dict, set_size: int = 3): """ Draw a gexf graph of the spot @@ -374,8 +362,8 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel spot_list = set() for seq, panfam in seq_to_pang.items(): finfo.write(seq + '\t' + panfam.name + "\t" + panfam.named_partition + "\t" + ",".join( - map(add_spot_str, fam2spot[panfam])) + "\t" + ",".join( - map(add_spot_str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n") + map(str, fam2spot[panfam])) + "\t" + ",".join( + map(str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n") spot_list |= set(fam2spot[panfam]) spot_list |= set(fam2border[panfam]) finfo.close() From 7e09b19f38b19c8b58183435e76964f429efa934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 5 Jun 2024 10:16:34 +0200 Subject: [PATCH 15/25] Fix bug in projection and refactoring --- .github/workflows/main.yml | 4 +- docs/user/writeFasta.md | 2 +- ppanggolin/align/alignOnPang.py | 53 +++++++++++++------------ ppanggolin/cluster/cluster.py | 7 ++-- ppanggolin/context/searchGeneContext.py | 11 +++-- ppanggolin/figures/draw_spot.py | 21 +++++----- ppanggolin/formats/writeSequences.py | 42 ++++++++++---------- ppanggolin/projection/projection.py | 8 ++-- 8 files changed, 76 insertions(+), 72 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cb5b0f47..94e5f44f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -89,7 +89,7 @@ jobs: ppanggolin rarefaction --output stepbystep -f -p stepbystep/pangenome.h5 --depth 5 --min 1 --max 50 -ms 10 -fd -ck 30 -K 3 --soft_core 0.9 -se $RANDOM ppanggolin draw -p stepbystep/pangenome.h5 --tile_plot --nocloud --soft_core 0.92 --ucurve --output stepbystep -f ppanggolin rgp -p stepbystep/pangenome.h5 --persistent_penalty 2 --variable_gain 1 --min_score 3 --dup_margin 0.05 - ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1 + ppanggolin spot -p stepbystep/pangenome.h5 --output stepbystep --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1 -f ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05 ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --regions --borders --families_tsv --cpu 1 @@ -100,7 +100,7 @@ jobs: ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0 ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud --threads 1 --keep_tmp + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --threads 1 --keep_tmp ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md index e0ba5013..843f0788 100644 --- a/docs/user/writeFasta.md +++ b/docs/user/writeFasta.md @@ -37,7 +37,7 @@ ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent This option can be used to write the amino acid CDS sequences. It can be used as such, to write all the genes of the pangenome for example: ```bash -ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot all +ppanggolin fasta -p pangenome.h5 --output MY_GENES --proteins all ``` Or to write only the cloud genes: diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 9eef70a6..0742a876 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -9,7 +9,7 @@ import subprocess import argparse from collections import defaultdict, Counter -from typing import List, Tuple, Set, Dict, TextIO +from typing import List, Tuple, Set, Dict, TextIO, Union, Iterable, Any from pathlib import Path from tqdm import tqdm @@ -25,7 +25,7 @@ from ppanggolin.formats.writeSequences import translate_genes, create_mmseqs_db -def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path, tmpdir: Path, cpu: int = 1, +def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]], tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, is_query_nt: bool = False, is_query_slf: bool = False, is_target_nt: bool = False, is_target_slf: bool = False, translation_table: int = None) -> Path: @@ -53,17 +53,16 @@ def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path, tmpdir: Pa f"translation table {translation_table}") target_db = translate_genes(target_seq_file, 'target_db', tmpdir, cpu, is_target_slf, translation_table) else: - target_db = create_mmseqs_db(target_seq_file, 'target_db', tmpdir, - db_mode=1 if is_target_slf else 0, db_type=1) - - with open(query_seq_files, 'r') as query_sequences: - if is_query_nt: - logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs " - f"with translation table {translation_table}") - query_db = translate_genes(query_sequences, 'query_db', tmpdir, cpu, is_query_slf, translation_table) - else: - query_db = create_mmseqs_db(query_sequences, 'query_db', tmpdir, - db_mode=1 if is_query_slf else 0, db_type=1) + target_db = create_mmseqs_db([target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file, + 'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=1) + + if is_query_nt: + logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs " + f"with translation table {translation_table}") + query_db = translate_genes(query_seq_files, 'query_db', tmpdir, cpu, is_query_slf, translation_table) + else: + query_db = create_mmseqs_db([query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files, + 'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=1) cov_mode = "2" # coverage of query if no_defrag: @@ -135,7 +134,7 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path, def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path, - pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: + pangenome: Pangenome) -> Tuple[Dict[Any, GeneFamily], Path]: """ Read alignment result to link input sequences to pangenome gene family. Alignment have been made against representative sequence of gene families of the pangenome. @@ -390,11 +389,11 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel f"{output / 'info_input_seq.tsv'}") -def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path, output: Path, - tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, - cpu: int = 1, no_defrag: bool = False, - identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, - disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: +def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path, + tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1, + no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, + translation_table: int = 11, disable_bar: bool = False + ) -> Tuple[Path, Dict[str, GeneFamily]]: """ Assign gene families from a pangenome to input sequences. @@ -422,9 +421,10 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path, with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, prefix="representative_genes", suffix=".faa") as tmp_pang_file: logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {tmp_pang_file.name}') + pangenome_sequences = Path(tmp_pang_file.name) write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) - align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files, + align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, @@ -435,11 +435,11 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path, return align_file, seq2pang -def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, output: Path, tmpdir: Path, - is_input_seq_nt: bool, is_input_slf: bool = False, - cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, - coverage: float = 0.8, translation_table: int = 11, - disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: +def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path, + tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1, + no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, + translation_table: int = 11, disable_bar: bool = False + ) -> Tuple[Path, Dict[str, GeneFamily]]: """ Assign gene families from a pangenome to input sequences. @@ -466,9 +466,10 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file: logging.getLogger("PPanGGOLiN").debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}') + pangenome_sequences = Path(tmp_pang_file.name) write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) - align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files, tmpdir=tmpdir, + align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True, is_target_slf=True, translation_table=translation_table) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 85d99ebc..639c578e 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -70,7 +70,7 @@ def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force "or provide a way to access the gene sequence during the annotation step " "(having the fasta in the gff files, or providing the fasta files through the --fasta option)") -def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8, +def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8, identity: float = 0.8, mode: int = 1) -> Tuple[Path, Path]: """ Make a first clustering of all sequences in pangenome @@ -304,10 +304,11 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) tmp_path = Path(newtmpdir.name) - with open(tmp_path/'nucleotid_sequences', "w") as sequence_file: + sequence_path = tmp_path/'nucleotid_sequences' + with open(sequence_path, "w") as sequence_file: check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") - rep, tsv = first_clustering(sequence_file, tmp_path, cpu, code, coverage, identity, mode) + rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode) fam2seq = read_faa(rep) if not defrag: diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 5856997e..0312e4fa 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -69,24 +69,23 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: if sequence_file is not None: # Alignment of sequences on pangenome families with read_compressed_or_not(sequence_file) as seqFileObj: - seq_set, is_nucleotide = get_seq_ids(seqFileObj) + seq_set, is_nucleotide, is_slf = get_seq_ids(seqFileObj) logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences") with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: - _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, - new_tmpdir, is_input_seq_nt=is_nucleotide, + _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, + is_input_seq_nt=is_nucleotide, is_input_slf=is_slf, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) else: - _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, - sequence_files=[sequence_file], + _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file, output=output, tmpdir=new_tmpdir, - is_input_seq_nt=is_nucleotide, + is_input_seq_nt=is_nucleotide, is_input_slf=is_slf, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index dc27baa1..bbd65bb5 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -156,7 +156,7 @@ def line_order_gene_lists(gene_lists: list, overlapping_match: int, exact_match: new_classify = set() -def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = 3, +def subgraph(spot: Spot, outname: Path, with_border: bool = True, set_size: int = 3, multigenics: set = None, fam_to_mod: dict = None): """ Write a pangeome subgraph of the gene families of a spot in gexf format @@ -212,7 +212,7 @@ def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = if "name" in g.nodes[node]: g.nodes[node]["name"] = g.nodes[node]["name"].most_common(1)[0][0] - nx.write_gexf(g, outname) + nx.write_gexf(g, outname.absolute().as_posix()) def is_gene_list_ordered(genes:List[Feature]): """ @@ -538,7 +538,7 @@ def add_genome_tools(fig, gene_recs: GlyphRenderer, genome_recs: GlyphRenderer, return column(genome_header, slider_spacing, slider_font, slider_offset) -def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam_col: dict, file_name: str): +def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam_col: dict, output: Path): """ :param gene_lists: @@ -550,7 +550,7 @@ def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam """ # Prepare the source data - output_file(file_name + ".html") + output_file(output) # generate the figure and add some tools to it wheel_zoom = WheelZoomTool() @@ -608,10 +608,10 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: fam2mod[fam] = f"module_{mod.ID}" for spot in tqdm(selected_spots, total=len(selected_spots), unit="spot", disable=disable_bar): - fname = output / f"spot_{str(spot.ID)}" - + basename = f"spot_{str(spot.ID)}" + identical_rgp_out = output / (basename + '_identical_rgps.tsv') # write rgps representatives and the rgps they are identical to - with open(fname.absolute().as_posix() + '_identical_rgps.tsv', 'w') as out_struc: + with open(identical_rgp_out, 'w') as out_struc: out_struc.write('representative_rgp\trepresentative_rgp_genome\tidentical_rgp\tidentical_rgp_genome\n') for key_rgp, other_rgps in spot.get_uniq_to_rgp().items(): for rgp in other_rgps: @@ -667,9 +667,10 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: uniq_gene_lists.append(genelist) ordered_counts.append(curr_genelist_count) - draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, fname.absolute().as_posix()) - subgraph(spot, fname.absolute().as_posix() + ".gexf", set_size=set_size, - multigenics=multigenics, fam_to_mod=fam2mod) + draw_spot_out = output / (basename + ".html") + subgraph_out = output / (basename + ".gexf") + draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, draw_spot_out) + subgraph(spot, subgraph_out, set_size=set_size, multigenics=multigenics, fam_to_mod=fam2mod) logging.getLogger("PPanGGOLiN").info(f"Done drawing spot(s), they can be found in the directory: '{output}'") diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 24d1ab76..f27d44ba 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -6,8 +6,9 @@ import logging import re import subprocess +import sys from pathlib import Path -from typing import TextIO, Dict, Set, Iterable +from typing import TextIO, Dict, Set, Iterable, Union import tempfile import shutil @@ -134,7 +135,7 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o file_obj.flush() -def create_mmseqs_db(sequences: TextIO, db_name: str, tmpdir: Path, db_mode: int = 0, db_type: int = 0) -> Path: +def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_mode: int = 0, db_type: int = 0) -> Path: """Create a MMseqs2 database from a sequences file. :param sequences: File with the sequences @@ -149,15 +150,16 @@ def create_mmseqs_db(sequences: TextIO, db_name: str, tmpdir: Path, db_mode: int assert db_type in [0, 1, 2], f"dbtype must be 0, 1 or 2, given {db_type}" seq_nucdb = tmpdir / db_name - cmd = list(map(str, ["mmseqs", "createdb", "--createdb-mode", db_mode, - "--dbtype", db_type, sequences.name, seq_nucdb])) + cmd = ["mmseqs", "createdb", "--createdb-mode", db_mode, "--dbtype", db_type] + cmd += [seq.absolute().as_posix() for seq in sequences] + [seq_nucdb] + cmd = list(map(str, cmd)) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) logging.getLogger("PPanGGOLiN").info("Creating sequence database...") subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return seq_nucdb -def translate_genes(sequences: TextIO, db_name: str, tmpdir: Path, threads: int = 1, +def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir: Path, threads: int = 1, is_single_line_fasta: bool = False, code: int = 11) -> Path: """Translate nucleotide sequences into MMSeqs2 amino acid sequences database @@ -170,8 +172,8 @@ def translate_genes(sequences: TextIO, db_name: str, tmpdir: Path, threads: int :return: Path to the MMSeqs2 database """ - seq_nucdb = create_mmseqs_db(sequences, 'nucleotide_sequences_db', tmpdir, - db_mode=1 if is_single_line_fasta else 0, db_type=2) + seq_nucdb = create_mmseqs_db([sequences] if isinstance(sequences, Path) else sequences, f'{db_name}_nt_db', + tmpdir, db_mode=1 if is_single_line_fasta else 0, db_type=2) logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") seqdb = tmpdir / db_name cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code])) @@ -223,14 +225,14 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") -def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: str, soft_core: float = 0.95, +def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: str, soft_core: float = 0.95, compress: bool = False, keep_tmp: bool = False, tmp: Path = None, threads: int = 1, code: int = 11, disable_bar: bool = False): """ Write all amino acid sequences from given genes in pangenome :param pangenome: Pangenome object with gene families sequences :param output: Path to output directory - :param genes_prot: Selected partition of gene + :param proteins: Selected partition of gene :param soft_core: Soft core threshold to use :param compress: Compress the file in .gz :param keep_tmp: Keep temporary directory @@ -242,11 +244,11 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: tmpdir = tmp / "translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes") mk_outdir(tmpdir, True, True) - write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar) + write_gene_sequences(pangenome, tmpdir, proteins, soft_core, compress, disable_bar) - with open(tmpdir / f"{genes_prot}_genes.fna") as sequences: - translate_db = translate_genes(sequences, 'aa_db', tmpdir, threads, True, code) - outpath = output / f"{genes_prot}_protein_genes.fna" + pangenome_sequences = tmpdir / f"{proteins}_genes.fna" + translate_db = translate_genes(pangenome_sequences, 'aa_db', tmpdir, threads, True, code) + outpath = output / f"{proteins}_protein_genes.fna" cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) @@ -527,7 +529,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, - soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None, + soft_core: float = 0.95, regions: str = None, genes: str = None, proteins: str = None, gene_families: str = None, prot_families: str = None, compress: bool = False, disable_bar: bool = False, **translate_kwgs): """ @@ -540,14 +542,14 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, :param soft_core: Soft core threshold to use :param regions: Write the RGP nucleotide sequences :param genes: Write all nucleotide CDS sequences - :param genes_prot: Write amino acid CDS sequences. + :param proteins: Write amino acid CDS sequences. :param gene_families: Write representative nucleotide sequences of gene families. :param prot_families: Write representative amino acid sequences of gene families. :param compress: Compress the file in .gz :param disable_bar: Disable progress bar """ - check_pangenome_to_write_sequences(pangenome, regions, genes, genes_prot, gene_families, prot_families, disable_bar) + check_pangenome_to_write_sequences(pangenome, regions, genes, proteins, gene_families, prot_families, disable_bar) if prot_families is not None: write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar) @@ -555,9 +557,9 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar) if genes is not None: write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar) - if genes_prot is not None: - write_gene_protein_sequences(pangenome, output, genes_prot, soft_core, compress, - disable_bar=disable_bar, **translate_kwgs) + if proteins is not None: + write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar, + **translate_kwgs) if regions is not None: write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) @@ -577,7 +579,7 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, - regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, + regions=args.regions, genes=args.genes, proteins=args.proteins, gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar, **translate_kwgs) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index f9227d38..7cdbb8d3 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -538,7 +538,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org """ seq_fasta_files = [] - logging.getLogger('PPanGGOLiN').info('Writting gene sequences of input genomes.') + logging.getLogger('PPanGGOLiN').info('Writing gene sequences of input genomes.') for input_organism in input_organisms: seq_outdir = output / input_organism.name @@ -557,14 +557,14 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org if use_representatives: _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_files, output=new_tmpdir, tmpdir=new_tmpdir, is_input_seq_nt=True, - cpu=cpu, no_defrag=no_defrag, identity=identity, - coverage=coverage, + is_input_slf=False, cpu=cpu, no_defrag=no_defrag, + identity=identity, coverage=coverage, translation_table=translation_table) else: _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=seq_fasta_files, output=new_tmpdir, tmpdir=new_tmpdir, - is_input_seq_nt=True, + is_input_seq_nt=True, is_input_slf=False, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, From e09e5f7f68df5c5bff9609a47e755503f3a9a9a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 6 Jun 2024 15:51:34 +0200 Subject: [PATCH 16/25] Improve documentation on write fasta --- docs/user/writeFasta.md | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md index 843f0788..7bae4712 100644 --- a/docs/user/writeFasta.md +++ b/docs/user/writeFasta.md @@ -20,7 +20,8 @@ When using the `softcore` filter, the `--soft_core` option can be used to modify ### Nucleotide sequences -This option can be used to write the nucleotide CDS sequences. It can be used as such, to write all the genes of the pangenome for example: +With the `--genes partition` option PPanGGOLiN will write the nucleotide CDS sequences for the given partition. +It can be used as such, to write all the genes of the pangenome for example: ```bash ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes all @@ -34,7 +35,8 @@ ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent ### Protein sequences -This option can be used to write the amino acid CDS sequences. It can be used as such, to write all the genes of the pangenome for example: +With the `--proteins partition` option PPanGGOLiN will write the nucleotide CDS sequences for the given partition. +It can be used as such, to write all the genes of the pangenome for example: ```bash ppanggolin fasta -p pangenome.h5 --output MY_GENES --proteins all @@ -46,19 +48,23 @@ Or to write only the cloud genes: ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot cloud ``` -To translate the genes sequences, PPanGGOLiN use [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. So for this option you can give multiple threads with `--threads`. You can also specify the translation table to use with `--translate_table`. Finally, you can keep the [MMSeqs2](https://github.com/soedinglab/MMseqs2) that are generated in the temporary directory (that you can also be specified with `--tmpdir`) by indicate the option `--keep_tmp`. +To translate the gene sequences, PPanGGOLiN uses the [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. +So for this option you can specify multiple threads with `--cpu`. +You can also specify the translation table to use with `--translate_table`. +Finally, you can keep the temporary directory -that you can specify with `--tmpdir`- with the [MMSeqs2](https://github.com/soedinglab/MMseqs2) database using the `--keep_tmp` option. ## Gene families ### Protein sequences -This option can be used to write the protein sequences of the representative sequences for each family. It can be used as such for all families: +With the `--prot_families partition` option PPanGGOLiN will write the protein sequences of the representative gene for each family for the given partition. +It can be used as such for all families: ```bash ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families all ``` -or for all the shell families for example: +Or for all the shell families for example: ```bash ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell @@ -66,16 +72,33 @@ ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell ### Nucleotide sequences -This option can be used to write the gene sequences of the representative sequences for each family. It can be used as such: +With the `--gene_families partition` option PPanGGOLiN will write the nucleotide sequences of the representative gene for each family for the given partition. +It can be used as such for all families: ```bash ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families all ``` -or for the cloud families for example: +Or for the core families for example: ```bash -ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families cloud +ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families core +``` + + +## Modules +All the precedent command admit a module as partition. + +So you can write the protein sequences for the family in module_X as such: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --prot_families module_X +``` + +Or the nucleotide sequence of all genes in module_X: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --genes module_X ``` ## Regions @@ -91,4 +114,4 @@ It can be used as such: ```bash ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --regions all --fasta genomes.fasta.list -``` +``` \ No newline at end of file From 9c8fb76c8565092f08623de2664b95f736843547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 7 Jun 2024 09:45:20 +0200 Subject: [PATCH 17/25] Fix createdb mode --- .github/workflows/main.yml | 2 +- ppanggolin/align/alignOnPang.py | 38 ++++++++++++------------- ppanggolin/cluster/cluster.py | 3 +- ppanggolin/context/searchGeneContext.py | 2 +- ppanggolin/formats/writeSequences.py | 19 +++++++------ ppanggolin/projection/projection.py | 10 ++++--- pyproject.toml | 6 ++++ 7 files changed, 45 insertions(+), 35 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 94e5f44f..2abe5b7f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -100,7 +100,7 @@ jobs: ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0 ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --threads 1 --keep_tmp + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --cpu $NUM_CPUS --keep_tmp ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 0742a876..0069b798 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -74,8 +74,7 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", delete=False) as aln_db: cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", - "--min-seq-id", str(identity), - "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), + "--min-seq-id", str(identity), "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), "--seed-sub-mat", "VTML40.out", "-s", "2", '--comp-bias-corr', "0", "--mask", "0", "-e", "1"] logging.getLogger("PPanGGOLiN").info("Aligning sequences") @@ -180,18 +179,19 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool, bool]: seq_set = set() seq_count = 0 first_seq_concat = "" - single_line_fasta = False + single_line_fasta = True count_fasta_line = 0 for line in seq_file: if line.startswith(">"): seq_set.add(line[1:].split()[0].strip()) seq_count += 1 if count_fasta_line > 1: # Allow to know if we can use soft link with createdb from MMSeqs2 - single_line_fasta = True + single_line_fasta = False count_fasta_line = 0 - elif seq_count <= 20: - first_seq_concat += line.strip() - count_fasta_line += 1 + else: + count_fasta_line += 1 + if seq_count <= 20: + first_seq_concat += line.strip() char_counter = Counter(first_seq_concat) is_nucleotide = all(char in dna_expected_char for char in char_counter) @@ -354,18 +354,18 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel logging.getLogger("PPanGGOLiN").info("Writing RGP and spot information related to hits in the pangenome") multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"]) - finfo = open(output / "info_input_seq.tsv", "w") - finfo.write("input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n") - fam2rgp = get_fam_to_rgp(pangenome, multigenics) - fam2spot, fam2border = get_fam_to_spot(pangenome, multigenics) - spot_list = set() - for seq, panfam in seq_to_pang.items(): - finfo.write(seq + '\t' + panfam.name + "\t" + panfam.named_partition + "\t" + ",".join( - map(str, fam2spot[panfam])) + "\t" + ",".join( - map(str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n") - spot_list |= set(fam2spot[panfam]) - spot_list |= set(fam2border[panfam]) - finfo.close() + with open(output / "info_input_seq.tsv", "w") as finfo: + finfo.write("input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n") + fam2rgp = get_fam_to_rgp(pangenome, multigenics) + fam2spot, fam2border = get_fam_to_spot(pangenome, multigenics) + spot_list = set() + for seq, panfam in seq_to_pang.items(): + finfo.write(seq + '\t' + panfam.name + "\t" + panfam.named_partition + "\t" + ",".join( + map(str, fam2spot[panfam])) + "\t" + ",".join( + map(str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n") + spot_list |= set(fam2spot[panfam]) + spot_list |= set(fam2border[panfam]) + if draw_related: drawn_spots = set() for spot in spot_list: diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 639c578e..c7d3d1f3 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -85,7 +85,8 @@ def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11 :return: path to representative sequence file and path to tsv clustering result """ - seqdb = translate_genes(sequences, 'aa_db', tmpdir, cpu, True, code) + seqdb = translate_genes(sequences=sequences, db_name='aa_db', tmpdir=tmpdir, cpu=cpu, + is_single_line_fasta=True, code=code) logging.getLogger("PPanGGOLiN").info("Clustering sequences...") cludb = tmpdir / 'cluster_db' cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id", diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 0312e4fa..508203fc 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -650,7 +650,7 @@ def parser_context(parser: argparse.ArgumentParser): default='graphml', choices=['gexf', 'graphml']) optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index f27d44ba..bc88ec20 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -151,7 +151,7 @@ def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_m seq_nucdb = tmpdir / db_name cmd = ["mmseqs", "createdb", "--createdb-mode", db_mode, "--dbtype", db_type] - cmd += [seq.absolute().as_posix() for seq in sequences] + [seq_nucdb] + cmd += [seq.absolute().as_posix() for seq in sequences] + [seq_nucdb.absolute()] cmd = list(map(str, cmd)) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) logging.getLogger("PPanGGOLiN").info("Creating sequence database...") @@ -159,14 +159,14 @@ def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_m return seq_nucdb -def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir: Path, threads: int = 1, +def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir: Path, cpu: int = 1, is_single_line_fasta: bool = False, code: int = 11) -> Path: """Translate nucleotide sequences into MMSeqs2 amino acid sequences database :param sequences: File with the nucleotide sequences :param db_name: name of the output database :param tmpdir: Temporary directory to save the MMSeqs2 files - :param threads: Number of available threads to use + :param cpu: Number of available threads to use :param is_single_line_fasta: Allow to use soft link in MMSeqs2 database :param code: Translation code to use @@ -176,7 +176,7 @@ def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir tmpdir, db_mode=1 if is_single_line_fasta else 0, db_type=2) logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") seqdb = tmpdir / db_name - cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code])) + cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return seqdb @@ -227,7 +227,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: str, soft_core: float = 0.95, compress: bool = False, keep_tmp: bool = False, tmp: Path = None, - threads: int = 1, code: int = 11, disable_bar: bool = False): + cpu: int = 1, code: int = 11, disable_bar: bool = False): """ Write all amino acid sequences from given genes in pangenome :param pangenome: Pangenome object with gene families sequences @@ -237,7 +237,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: s :param compress: Compress the file in .gz :param keep_tmp: Keep temporary directory :param tmp: Path to temporary directory - :param threads: Number of threads available + :param cpu: Number of threads available :param code: Genetic code use to translate nucleotide sequences to protein sequences :param disable_bar: Disable progress bar """ @@ -247,7 +247,8 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: s write_gene_sequences(pangenome, tmpdir, proteins, soft_core, compress, disable_bar) pangenome_sequences = tmpdir / f"{proteins}_genes.fna" - translate_db = translate_genes(pangenome_sequences, 'aa_db', tmpdir, threads, True, code) + translate_db = translate_genes(sequences=pangenome_sequences, db_name='aa_db', tmpdir=tmpdir, + cpu=cpu, is_single_line_fasta=True, code=code) outpath = output / f"{proteins}_protein_genes.fna" cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) @@ -572,7 +573,7 @@ def launch(args: argparse.Namespace): """ check_write_sequences_args(args) translate_kwgs = {"code": args.translation_table, - "threads": args.threads, + "cpu": args.cpu, "tmp": args.tmpdir, "keep_tmp": args.keep_tmp} mk_outdir(args.output, args.force) @@ -661,7 +662,7 @@ def parser_seq(parser: argparse.ArgumentParser): optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") - optional.add_argument("--threads", required=False, default=1, type=int, help="Number of available threads") + optional.add_argument("--cpu", required=False, default=1, type=int, help="Number of available threads") optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 7cdbb8d3..cddfd338 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -555,16 +555,18 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: - _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_files, output=new_tmpdir, - tmpdir=new_tmpdir, is_input_seq_nt=True, - is_input_slf=False, cpu=cpu, no_defrag=no_defrag, + _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome=pangenome, + sequence_files=seq_fasta_files, + output=new_tmpdir, tmpdir=new_tmpdir, + is_input_seq_nt=True, is_input_slf=True, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table) else: _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=seq_fasta_files, output=new_tmpdir, tmpdir=new_tmpdir, - is_input_seq_nt=True, is_input_slf=False, + is_input_seq_nt=True, is_input_slf=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, diff --git a/pyproject.toml b/pyproject.toml index 7864f533..abdbf2d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,3 +81,9 @@ packages = ["ppanggolin"] [tool.setuptools.dynamic] version = {file = "VERSION"} + +[tool.pytest.ini_options] +markers = [ + "nucleotides: marks tests to run with nucleotides alphabets", + "aminoacids: marks tests to run with amino-acids alphabets", +] \ No newline at end of file From 56e4318dbe33dc6ab92e6f518b1ce5b7b11ab9f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 7 Jun 2024 10:28:47 +0200 Subject: [PATCH 18/25] Improve documentation on write fasta --- docs/user/writeFasta.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md index 7bae4712..10725d5b 100644 --- a/docs/user/writeFasta.md +++ b/docs/user/writeFasta.md @@ -49,9 +49,9 @@ ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot cloud ``` To translate the gene sequences, PPanGGOLiN uses the [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. -So for this option you can specify multiple threads with `--cpu`. -You can also specify the translation table to use with `--translate_table`. -Finally, you can keep the temporary directory -that you can specify with `--tmpdir`- with the [MMSeqs2](https://github.com/soedinglab/MMseqs2) database using the `--keep_tmp` option. +So for this option you can specify multiple threads with `--cpu`. +You can also specify the translation table to use with `--translate_table`. +The temporary directory, can be specified with `--tmpdir` to store the [MMSeqs2](https://github.com/soedinglab/MMseqs2) database and other files. Temporary files will be deleted at the end of the execution. To keep them, you can use the `--keep_tmp` option. ## Gene families From 13bd4e0aed2ca17ce5655ebb0a1ac1854da621d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 7 Jun 2024 15:02:58 +0200 Subject: [PATCH 19/25] Add test unit for align --- tests/align/test_align.py | 111 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 tests/align/test_align.py diff --git a/tests/align/test_align.py b/tests/align/test_align.py new file mode 100644 index 00000000..e5ce2dde --- /dev/null +++ b/tests/align/test_align.py @@ -0,0 +1,111 @@ +import pytest +from typing import List +from random import choice, randint + +from ppanggolin.align.alignOnPang import get_seq_ids + + +@pytest.fixture +def nucleotides(): + yield ['A', 'C', 'G', 'T', 'N'] + +@pytest.fixture +def aminoacids(): + amino_acid_alphabet = [ + 'A', # Alanine + 'R', # Arginine + 'N', # Asparagine + 'D', # Aspartic acid + 'C', # Cysteine + 'E', # Glutamic acid + 'Q', # Glutamine + 'G', # Glycine + 'H', # Histidine + 'I', # Isoleucine + 'L', # Leucine + 'K', # Lysine + 'M', # Methionine + 'F', # Phenylalanine + 'P', # Proline + 'S', # Serine + 'T', # Threonine + 'W', # Tryptophan + 'Y', # Tyrosine + 'V' # Valine + ] + yield amino_acid_alphabet + +@pytest.fixture +def number_of_sequences(): + yield randint(4, 10) +@pytest.fixture +def single_line_fasta(request, tmp_path_factory: pytest.TempPathFactory, number_of_sequences, + aminoacids: List[str], nucleotides: List[str]): + if request.node.get_closest_marker('aminoacids'): + alphabet = aminoacids + fasta_path = tmp_path_factory.getbasetemp() / "single_line_nt.fasta" + else: + alphabet = nucleotides + fasta_path = tmp_path_factory.getbasetemp() / "single_line_aa.fasta" + with open(fasta_path, "w") as fasta: + for i in range(number_of_sequences): + fasta.write(f">Gene_{i}\n") + fasta.write("".join([choice(alphabet) for _ in range(0, randint(30, 100))])) + fasta.write("\n") + yield fasta_path + +@pytest.fixture +def multi_line_fasta(request, tmp_path_factory: pytest.TempPathFactory, number_of_sequences, + aminoacids: List[str], nucleotides: List[str]): + if request.node.get_closest_marker('aminoacids'): + alphabet = aminoacids + fasta_path = tmp_path_factory.getbasetemp() / "single_line_nt.fasta" + else: + alphabet = nucleotides + fasta_path = tmp_path_factory.getbasetemp() / "single_line_aa.fasta" + with open(fasta_path, "w") as fasta: + for i in range(number_of_sequences): + fasta.write(f">Gene_{i}\n") + for j in range(randint(4,10)): + fasta.write("".join([choice(alphabet) for _ in range(60)])) + fasta.write("\n") + yield fasta_path + +@pytest.mark.nucleotides +def test_get_seq_ids_single_line_nt(number_of_sequences, single_line_fasta): + with open(single_line_fasta, "r") as fasta: + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) + assert len(seq_set) == number_of_sequences + assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} + assert is_nucleotide + assert single_line_fasta + +@pytest.mark.aminoacids +def test_get_seq_ids_single_line_aa(number_of_sequences, single_line_fasta): + with open(single_line_fasta, "r") as fasta: + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) + assert len(seq_set) == number_of_sequences + assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} + assert not is_nucleotide + assert single_line_fasta + +@pytest.mark.nucleotides +def test_get_seq_ids_multi_line_nt(number_of_sequences, multi_line_fasta): + with open(multi_line_fasta, "r") as fasta: + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) + assert len(seq_set) == number_of_sequences + assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} + assert is_nucleotide + assert not single_line_fasta + +@pytest.mark.aminoacids +def test_get_seq_ids_multi_line_aa(number_of_sequences, multi_line_fasta): + with open(multi_line_fasta, "r") as fasta: + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) + assert len(seq_set) == number_of_sequences + assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} + assert not is_nucleotide + assert not single_line_fasta + + + From b5b0e4dcfe2eecd7e7d1c9f2ea9e13fe0562a2eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 7 Jun 2024 15:28:13 +0200 Subject: [PATCH 20/25] Use context manager to write sequences --- ppanggolin/align/alignOnPang.py | 74 ++++--- ppanggolin/cluster/cluster.py | 60 +++--- ppanggolin/formats/readBinaries.py | 79 ++++---- ppanggolin/formats/writeSequences.py | 107 +++++----- ppanggolin/projection/projection.py | 291 ++++++++++++++------------- 5 files changed, 305 insertions(+), 306 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 0069b798..2b5547d3 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -9,7 +9,7 @@ import subprocess import argparse from collections import defaultdict, Counter -from typing import List, Tuple, Set, Dict, TextIO, Union, Iterable, Any +from typing import List, Tuple, Set, Dict, Union, Iterable, Any from pathlib import Path from tqdm import tqdm @@ -25,8 +25,8 @@ from ppanggolin.formats.writeSequences import translate_genes, create_mmseqs_db -def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]], tmpdir: Path, cpu: int = 1, - no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, +def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]], + tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, is_query_nt: bool = False, is_query_slf: bool = False, is_target_nt: bool = False, is_target_slf: bool = False, translation_table: int = None) -> Path: """ @@ -51,7 +51,9 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi if is_target_nt: logging.getLogger("PPanGGOLiN").debug("Target sequences will be translated by mmseqs with " f"translation table {translation_table}") - target_db = translate_genes(target_seq_file, 'target_db', tmpdir, cpu, is_target_slf, translation_table) + with create_tmpdir(tmpdir, basename="target_db", keep_tmp=True) as target_db_dir: + # Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same + target_db = translate_genes(target_seq_file, target_db_dir, cpu, is_target_slf, translation_table) else: target_db = create_mmseqs_db([target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file, 'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=1) @@ -59,7 +61,9 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi if is_query_nt: logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs " f"with translation table {translation_table}") - query_db = translate_genes(query_seq_files, 'query_db', tmpdir, cpu, is_query_slf, translation_table) + with create_tmpdir(tmpdir, basename="query_db", keep_tmp=True) as query_db_dir: + # Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same + query_db = translate_genes(query_seq_files, query_db_dir, cpu, is_query_slf, translation_table) else: query_db = create_mmseqs_db([query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files, 'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=1) @@ -199,35 +203,34 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool, bool]: return seq_set, is_nucleotide, single_line_fasta -def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIO, add: str = "", disable_bar: bool = False): +def write_gene_fam_sequences(pangenome: Pangenome, output: Path, add: str = "", disable_bar: bool = False): """ Export the sequence of gene families :param pangenome: Pangenome containing families - :param file_obj: Temporary file where sequences will be written + :param output: Path to file where sequences will be written :param add: Add prefix to sequence name :param disable_bar: disable progress bar """ - for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar, - total=pangenome.number_of_gene_families): - file_obj.write(">" + add + fam.name + "\n") - file_obj.write(fam.sequence + "\n") - # file_obj.flush() + with open(output, "w") as file_obj: + for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar, + total=pangenome.number_of_gene_families): + file_obj.write(">" + add + fam.name + "\n") + file_obj.write(fam.sequence + "\n") -def write_all_gene_sequences(pangenome: Pangenome, file_obj: TextIO, add: str = "", disable_bar: bool = False): +def write_all_gene_sequences(pangenome: Pangenome, output: Path, add: str = "", disable_bar: bool = False): """ Export the sequence of pangenome genes :param pangenome: Pangenome containing genes - :param file_obj: Temporary file where sequences will be written + :param output: Path to file where sequences will be written :param add: Add prefix to sequence name :param disable_bar: disable progress bar - """ if pangenome.status["geneSequences"] == "inFile": - get_non_redundant_gene_sequences_from_file(pangenome.file, file_obj, add=add, disable_bar=disable_bar) + get_non_redundant_gene_sequences_from_file(pangenome.file, output, add=add, disable_bar=disable_bar) else: # this should never happen if the pangenome has been properly checked before launching this function. raise Exception("The pangenome does not include gene sequences") @@ -417,20 +420,16 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union and a dictionary mapping input sequences to gene families. """ - # delete False to be able to keep tmp file. If they are not keep tmpdir will be destroyed so no need to delete tmpfile - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, - prefix="representative_genes", suffix=".faa") as tmp_pang_file: - logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {tmp_pang_file.name}') - pangenome_sequences = Path(tmp_pang_file.name) - write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) + pangenome_sequences = tmpdir / "proteins_families.faa" + logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {pangenome_sequences.absolute()}') + write_gene_fam_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar) - align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, - tmpdir=tmpdir, cpu=cpu, - no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, - is_target_nt=False, is_target_slf=False, translation_table=translation_table) + align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, + tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, + is_target_nt=False, is_target_slf=True, translation_table=translation_table) - seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) + seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) return align_file, seq2pang @@ -462,19 +461,16 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Unio :return: A tuple containing the path to the alignment result file, and a dictionary mapping input sequences to gene families. """ + pangenome_sequences = tmpdir / "nucleotide_genes.fna" + logging.getLogger("PPanGGOLiN").debug(f'Write all pangenome gene sequences in {pangenome_sequences.absolute()}') + write_all_gene_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar) - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, - prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file: - logging.getLogger("PPanGGOLiN").debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}') - pangenome_sequences = Path(tmp_pang_file.name) - write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) - - align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, - cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True, - is_target_slf=True, translation_table=translation_table) + align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True, + is_target_slf=True, translation_table=translation_table) - seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) + seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) return align_file, seq2pang diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index c7d3d1f3..faab8873 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -8,7 +8,7 @@ from collections import defaultdict import os import argparse -from typing import TextIO, Tuple, Dict, Set +from typing import Tuple, Dict, Set from pathlib import Path import time @@ -20,11 +20,10 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Gene from ppanggolin.geneFamily import GeneFamily -from ppanggolin.utils import read_compressed_or_not, restricted_float +from ppanggolin.utils import read_compressed_or_not, restricted_float, create_tmpdir from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes -from ppanggolin.utils import mk_outdir # Global functions @@ -43,14 +42,14 @@ def check_pangenome_former_clustering(pangenome: Pangenome, force: bool = False) # Clustering functions -def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force: bool = False, +def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force: bool = False, disable_bar: bool = False): """ Check the pangenome statuses and write the gene sequences in the provided tmpFile. (whether they are written in the .h5 file or currently in memory) :param pangenome: Annotated Pangenome - :param tmp_file: Temporary file + :param sequences: Path to write the sequences :param force: Force to write on existing pangenome information :param disable_bar: Allow to disable progress bar """ @@ -58,18 +57,19 @@ def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force if pangenome.status["geneSequences"] in ["Computed", "Loaded"]: logging.getLogger("PPanGGOLiN").debug("Write sequences from annotation loaded in pangenome") # we append the gene ids by 'ppanggolin' to avoid crashes from mmseqs when sequence IDs are only numeric. - write_gene_sequences_from_annotations(pangenome.genes, tmp_file, add="ppanggolin_", disable_bar=disable_bar) + write_gene_sequences_from_annotations(pangenome.genes, sequences, add="ppanggolin_", + compress=False, disable_bar=disable_bar) elif pangenome.status["geneSequences"] == "inFile": logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file") - write_gene_sequences_from_pangenome_file(pangenome.file, tmp_file, add="ppanggolin_", - disable_bar=disable_bar) # write CDS sequences to the tmpFile + write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_", + compress=False, disable_bar=disable_bar) # write CDS sequences to the tmpFile else: - tmp_file.close() # closing the tmp file since an exception will be raised. raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster " "the genes in gene families. Either provide clustering results (see --clusters), " "or provide a way to access the gene sequence during the annotation step " "(having the fasta in the gff files, or providing the fasta files through the --fasta option)") + def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8, identity: float = 0.8, mode: int = 1) -> Tuple[Path, Path]: """ @@ -85,7 +85,7 @@ def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11 :return: path to representative sequence file and path to tsv clustering result """ - seqdb = translate_genes(sequences=sequences, db_name='aa_db', tmpdir=tmpdir, cpu=cpu, + seqdb = translate_genes(sequences=sequences, tmpdir=tmpdir, cpu=cpu, is_single_line_fasta=True, code=code) logging.getLogger("PPanGGOLiN").info("Clustering sequences...") cludb = tmpdir / 'cluster_db' @@ -294,34 +294,23 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = :param disable_bar: Disable the progress bar during clustering. :param keep_tmp_files: Keep temporary files (useful for debugging). """ - - if keep_tmp_files: - date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime()) - - dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}' - tmp_path = Path(tmpdir) / dir_name - mk_outdir(tmp_path, force=True) - else: - newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) - tmp_path = Path(newtmpdir.name) - - sequence_path = tmp_path/'nucleotid_sequences' - with open(sequence_path, "w") as sequence_file: - check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar) + date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime()) + dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}' + with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path: + sequence_path = tmp_path/'nucleotide_sequences.fna' + check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode) - fam2seq = read_faa(rep) - if not defrag: - logging.getLogger("PPanGGOLiN").debug("No defragmentation") - genes2fam, _ = read_tsv(tsv) - else: - logging.getLogger("PPanGGOLiN").info("Associating fragments to their original gene family...") - aln = align_rep(rep, tmp_path, cpu, coverage, identity) - genes2fam, fam2seq = refine_clustering(tsv, aln, fam2seq) - pangenome.status["defragmented"] = "Computed" - if not keep_tmp_files: - newtmpdir.cleanup() + fam2seq = read_faa(rep) + if not defrag: + logging.getLogger("PPanGGOLiN").debug("No defragmentation") + genes2fam, _ = read_tsv(tsv) + else: + logging.getLogger("PPanGGOLiN").info("Associating fragments to their original gene family...") + aln = align_rep(rep, tmp_path, cpu, coverage, identity) + genes2fam, fam2seq = refine_clustering(tsv, aln, fam2seq) + pangenome.status["defragmented"] = "Computed" read_fam2seq(pangenome, fam2seq) read_gene2fam(pangenome, genes2fam, disable_bar=disable_bar) @@ -512,6 +501,7 @@ def parser_clust(parser: argparse.ArgumentParser): "with their original gene family.") clust.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") + # clust.add_argument("--compress") clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index bb0f02d5..4c90be2c 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -4,7 +4,7 @@ # default libraries import logging from pathlib import Path -from typing import TextIO, Dict, Any, Set, List, Tuple +from typing import Dict, Any, Set, List, Tuple from collections import defaultdict # installed libraries @@ -17,6 +17,7 @@ from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot, Module from ppanggolin.metadata import Metadata +from ppanggolin.utils import write_compressed_or_not class Genedata: @@ -26,7 +27,7 @@ class Genedata: """ def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: int, name: str, product: str, - genetic_code: int, coordinates:List[Tuple[int]] = None): + genetic_code: int, coordinates: List[Tuple[int]] = None): """Constructor method :param start: Gene start position @@ -60,6 +61,7 @@ def __eq__(self, other): and self.genetic_code == other.genetic_code \ and self.coordinates == other.coordinates \ + def __hash__(self): return hash((self.start, self.stop, self.strand, self.gene_type, self.position, self.name, self.product, self.genetic_code, tuple(self.coordinates))) @@ -172,9 +174,10 @@ def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: for row in read_chunks(table, chunk=20000): start = int(row["start"]) stop = int(row["stop"]) - - if "has_joined_coordinates" in row.dtype.names and row["has_joined_coordinates"]: # manage gene with joined coordinates if the info exists - + + if "has_joined_coordinates" in row.dtype.names and row["has_joined_coordinates"]: + # manage gene with joined coordinates if the info exists + try: coordinates = genedata_id_to_coordinates[row["genedata_id"]] except KeyError: @@ -183,7 +186,6 @@ def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: else: coordinates = [(start, stop)] - genedata = Genedata(start=start, stop=stop, strand=row["strand"].decode(), @@ -199,6 +201,7 @@ def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: return genedata_id2genedata + def read_join_coordinates(h5f: tables.File) -> Dict[str, List[Tuple[int, int]]]: """ Read join coordinates from a HDF5 file and return a dictionary mapping genedata_id to coordinates. @@ -207,7 +210,7 @@ def read_join_coordinates(h5f: tables.File) -> Dict[str, List[Tuple[int, int]]]: :return: A dictionary mapping genedata_id to a list of tuples representing start and stop coordinates. """ genedata_id_to_coordinates = defaultdict(list) - + if not hasattr(h5f.root.annotations, "joinedCoordinates"): # then the pangenome file has no joined annotations # or has been made before the joined annotations coordinates @@ -218,7 +221,8 @@ def read_join_coordinates(h5f: tables.File) -> Dict[str, List[Tuple[int, int]]]: for row in read_chunks(table, chunk=20000): genedata_id = row["genedata_id"] - genedata_id_to_coordinates[genedata_id].append((int(row["coordinate_rank"]), int(row["start"]), int(row["stop"]))) + genedata_id_to_coordinates[genedata_id].append( + (int(row["coordinate_rank"]), int(row["start"]), int(row["stop"]))) # sort coordinate by their rank genedata_id_to_sorted_coordinates = {} @@ -242,21 +246,21 @@ def read_sequences(h5f: tables.File) -> dict: return seqid2seq -def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, file_obj: TextIO, add: str = '', +def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, output: Path, add: str = '', disable_bar: bool = False): """ Writes the non-redundant CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. :param pangenome_filename: Name of the pangenome file - :param file_obj: Name of the output file + :param output: Path to the output file :param add: Add a prefix to sequence header :param disable_bar: disable progress bar """ - logging.getLogger("PPanGGOLiN").info( - f"Extracting and writing non redundant CDS sequences from {pangenome_filename} to {file_obj.name}") + logging.getLogger("PPanGGOLiN").info(f"Extracting and writing non redundant CDS sequences from {pangenome_filename}" + f" to {output.absolute()}") with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: @@ -269,41 +273,41 @@ def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, file_obj seqid2cds_name[row["seqid"]] = row["gene"].decode() table = h5f.root.annotations.sequences - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): - cds_name = seqid2cds_name[row["seqid"]] - file_obj.write(f'>{add}{cds_name}\n') - file_obj.write(f'{row["dna"].decode()}\n') - - file_obj.flush() + with open(output, "w") as file_obj: + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): + cds_name = seqid2cds_name[row["seqid"]] + file_obj.write(f'>{add}{cds_name}\n') + file_obj.write(f'{row["dna"].decode()}\n') -def write_gene_sequences_from_pangenome_file(pangenome_filename: str, file_obj: TextIO, list_cds: iter = None, - add: str = '', disable_bar: bool = False): +def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Path, list_cds: iter = None, + add: str = '', compress: bool = False, disable_bar: bool = False): """ Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. :param pangenome_filename: Name of the pangenome file - :param file_obj: Name of the output file + :param output: Path to the sequences file :param list_cds: An iterable object of CDS :param add: Add a prefix to sequence header + :param compress: Compress the output file :param disable_bar: Prevent to print disable progress bar """ - logging.getLogger("PPanGGOLiN").info( - f"Extracting and writing CDS sequences from a {pangenome_filename} file to a fasta file...") - h5f = tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) - - table = h5f.root.annotations.geneSequences - list_cds = set(list_cds) if list_cds is not None else None - seqid2seq = read_sequences(h5f) - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): - # Read the table chunk per chunk otherwise RAM dies on big pangenomes - name_cds = row["gene"].decode() - if row["type"] == b"CDS" and (list_cds is None or name_cds in list_cds): - file_obj.write('>' + add + name_cds + "\n") - file_obj.write(seqid2seq[row["seqid"]] + "\n") - file_obj.flush() - h5f.close() + logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {pangenome_filename} " + "file to a fasta file...") + with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: + table = h5f.root.annotations.geneSequences + list_cds = set(list_cds) if list_cds is not None else None + seqid2seq = read_sequences(h5f) + with write_compressed_or_not(output, compress) as file_obj: + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): + # Read the table chunk per chunk otherwise RAM dies on big pangenomes + name_cds = row["gene"].decode() + if row["type"] == b"CDS" and (list_cds is None or name_cds in list_cds): + file_obj.write('>' + add + name_cds + "\n") + file_obj.write(seqid2seq[row["seqid"]] + "\n") + logging.getLogger("PPanGGOLiN").debug("Gene sequences from pangenome file was written to " + f"{output.absolute()}{'.gz' if compress else ''}") def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): @@ -520,7 +524,8 @@ def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[in local = "" gene.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, gene_type=genedata.gene_type, name=genedata.name, position=genedata.position, - genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local, coordinates=genedata.coordinates) + genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local, + coordinates=genedata.coordinates) gene.is_fragment = row["is_fragment"] if link: contig = pangenome.get_contig(identifier=int(row["contig"])) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index bc88ec20..272e1a8a 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -6,9 +6,8 @@ import logging import re import subprocess -import sys from pathlib import Path -from typing import TextIO, Dict, Set, Iterable, Union +from typing import Dict, Set, Iterable, Union import tempfile import shutil @@ -19,7 +18,8 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Organism -from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype +from ppanggolin.utils import (write_compressed_or_not, mk_outdir, create_tmpdir, read_compressed_or_not, + restricted_float, detect_filetype) from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file module_regex = re.compile(r'^module_\d+') # \d == [0-9] @@ -115,24 +115,25 @@ def check_pangenome_to_write_sequences(pangenome: Pangenome, regions: str = None need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar) -def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_obj: TextIO, add: str = '', - disable_bar: bool = False): +def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], output: Path, add: str = '', + compress: bool = False, disable_bar: bool = False): """ Writes the CDS sequences to a File object, and adds the string provided through `add` in front of it. Loads the sequences from previously computed or loaded annotations. :param genes_to_write: Genes to write. - :param file_obj: Output file to write sequences. + :param output: Path to output file to write sequences. :param add: Add prefix to gene ID. + :param compress: Compress the file in .gz :param disable_bar: Disable progress bar. """ - logging.getLogger("PPanGGOLiN").info(f"Writing all CDS sequences in {file_obj.name}") - for gene in tqdm(genes_to_write, unit="gene", disable=disable_bar): - if gene.type == "CDS": - file_obj.write(f'>{add}{gene.ID}\n') - file_obj.write(f'{gene.dna}\n') - file_obj.flush() + logging.getLogger("PPanGGOLiN").info(f"Writing all CDS sequences in {output.absolute()}") + with write_compressed_or_not(output, compress) as file_obj: + for gene in tqdm(genes_to_write, unit="gene", disable=disable_bar): + if gene.type == "CDS": + file_obj.write(f'>{add}{gene.ID}\n') + file_obj.write(f'{gene.dna}\n') def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_mode: int = 0, db_type: int = 0) -> Path: @@ -159,12 +160,11 @@ def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_m return seq_nucdb -def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir: Path, cpu: int = 1, +def translate_genes(sequences: Union[Path, Iterable[Path]], tmpdir: Path, cpu: int = 1, is_single_line_fasta: bool = False, code: int = 11) -> Path: """Translate nucleotide sequences into MMSeqs2 amino acid sequences database :param sequences: File with the nucleotide sequences - :param db_name: name of the output database :param tmpdir: Temporary directory to save the MMSeqs2 files :param cpu: Number of available threads to use :param is_single_line_fasta: Allow to use soft link in MMSeqs2 database @@ -172,10 +172,10 @@ def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir :return: Path to the MMSeqs2 database """ - seq_nucdb = create_mmseqs_db([sequences] if isinstance(sequences, Path) else sequences, f'{db_name}_nt_db', + seq_nucdb = create_mmseqs_db([sequences] if isinstance(sequences, Path) else sequences, 'nucleotides_db', tmpdir, db_mode=1 if is_single_line_fasta else 0, db_type=2) logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") - seqdb = tmpdir / db_name + seqdb = tmpdir / 'translate_db' cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) @@ -183,7 +183,7 @@ def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_core: float = 0.95, - compress: bool = False, disable_bar: bool = False): + compress: bool = False, disable_bar: bool = False) -> Path: """ Write all nucleotide CDS sequences @@ -213,16 +213,16 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co genes_to_write.extend(fam.genes) logging.getLogger("PPanGGOLiN").info(f"There are {len(genes_to_write)} genes to write") - with write_compressed_or_not(outpath, compress) as fasta: - if pangenome.status["geneSequences"] in ["inFile"]: - write_gene_sequences_from_pangenome_file(pangenome.file, fasta, set([gene.ID for gene in genes_to_write]), - disable_bar=disable_bar) - elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]: - write_gene_sequences_from_annotations(genes_to_write, fasta, disable_bar=disable_bar) - else: - # this should never happen if the pangenome has been properly checked before launching this function. - raise AttributeError("The pangenome does not include gene sequences") - logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") + if pangenome.status["geneSequences"] in ["inFile"]: + write_gene_sequences_from_pangenome_file(pangenome.file, outpath, set([gene.ID for gene in genes_to_write]), + compress=compress, disable_bar=disable_bar) + elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]: + write_gene_sequences_from_annotations(genes_to_write, outpath, compress=compress, disable_bar=disable_bar) + else: + # this should never happen if the pangenome has been properly checked before launching this function. + raise AttributeError("The pangenome does not include gene sequences") + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}{'.gz' if compress else ''}'") + return outpath def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: str, soft_core: float = 0.95, @@ -241,23 +241,26 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: s :param code: Genetic code use to translate nucleotide sequences to protein sequences :param disable_bar: Disable progress bar """ - tmpdir = tmp / "translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes") - mk_outdir(tmpdir, True, True) - - write_gene_sequences(pangenome, tmpdir, proteins, soft_core, compress, disable_bar) - - pangenome_sequences = tmpdir / f"{proteins}_genes.fna" - translate_db = translate_genes(sequences=pangenome_sequences, db_name='aa_db', tmpdir=tmpdir, - cpu=cpu, is_single_line_fasta=True, code=code) - outpath = output / f"{proteins}_protein_genes.fna" - cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") - - if not keep_tmp: - logging.getLogger("PPanGGOLiN").debug("Clean temporary directory") - shutil.rmtree(tmpdir) + with create_tmpdir(tmp if tmp is not None else Path(tempfile.gettempdir()), + basename="translateGenes", keep_tmp=keep_tmp) as tmpdir: + + write_gene_sequences(pangenome, tmpdir, proteins, soft_core, compress, disable_bar) + + pangenome_sequences = tmpdir / f"{proteins}_genes.fna{'.gz' if compress else ''}" + translate_db = translate_genes(sequences=pangenome_sequences, tmpdir=tmpdir, + cpu=cpu, is_single_line_fasta=True, code=code) + outpath = output / f"{proteins}_protein_genes.fna" + cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + if compress: + with write_compressed_or_not(outpath, compress) as compress_file: + with open(outpath, "r") as sequence_file: + shutil.copyfileobj(sequence_file, compress_file) + outpath.unlink() + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}.gz'") + else: + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]: @@ -331,12 +334,11 @@ def write_fasta_gene_fam(pangenome: Pangenome, output: Path, gene_families: str, genefams = select_families(pangenome, gene_families, "representative nucleotide sequences of the gene families", soft_core) - with write_compressed_or_not(outpath, compress) as fasta: - write_gene_sequences_from_pangenome_file(pangenome.file, fasta, [fam.name for fam in genefams], - disable_bar=disable_bar) + write_gene_sequences_from_pangenome_file(pangenome.file, outpath, [fam.name for fam in genefams], + compress=compress, disable_bar=disable_bar) - logging.getLogger("PPanGGOLiN").info( - f"Done writing the representative nucleotide sequences of the gene families : '{outpath}'") + logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " + f"of the gene families : '{outpath}{'.gz' if compress else ''}") def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, soft_core: float = 0.95, @@ -361,8 +363,8 @@ def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, for fam in tqdm(genefams, unit="prot families", disable=disable_bar): fasta.write('>' + fam.name + "\n") fasta.write(fam.sequence + "\n") - logging.getLogger("PPanGGOLiN").info( - f"Done writing the representative amino acid sequences of the gene families : '{outpath}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the representative amino acid sequences of the gene families:" + f"'{outpath}{'.gz' if compress else ''}'") def read_fasta_or_gff(file_path: Path) -> Dict[str, str]: @@ -526,7 +528,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa fasta.write(f">{region.name}\n") fasta.write( write_spaced_fasta(genome_sequence[region.contig.name][region.start:region.stop], 60)) - logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: '{outname}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: " + f"'{outname}{'.gz' if compress else ''}'") def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index cddfd338..c06331e9 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -19,7 +19,6 @@ from tqdm import tqdm import networkx as nx import yaml -import pandas as pd # # local libraries from ppanggolin.annotate.synta import read_fasta, get_dna_sequence @@ -39,8 +38,10 @@ from ppanggolin.genome import Organism from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot, Module -from ppanggolin.formats.writeFlatGenomes import write_proksee_organism, manage_module_colors, write_gff_file, write_tsv_genome_file -from ppanggolin.formats.writeFlatPangenome import summarize_spots, summarize_genome, write_summaries_in_tsv, write_rgp_table +from ppanggolin.formats.writeFlatGenomes import write_proksee_organism, manage_module_colors, write_gff_file, \ + write_tsv_genome_file +from ppanggolin.formats.writeFlatPangenome import summarize_spots, summarize_genome, write_summaries_in_tsv, \ + write_rgp_table from ppanggolin.formats.writeSequences import read_genome_file @@ -53,7 +54,8 @@ class NewSpot(Spot): def __str__(self): return f'new_spot_{str(self.ID)}' -def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool): + +def check_pangenome_for_projection(pangenome: Pangenome, fast_aln: bool): """ Check the status of a pangenome and determine whether projection is possible. @@ -76,7 +78,6 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool): predict_rgp = True project_spots = True - if pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]: raise NameError("The provided pangenome has not been partitioned. " "Annotation of an external genome is therefore not possible. " @@ -99,7 +100,7 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool): "Projection of modules into the provided genome will not be performed.") project_modules = False - + if pangenome.status["geneSequences"] not in ["Loaded", "Computed", "inFile"] and not fast_aln: raise Exception("The provided pangenome has no gene sequences. " "Projection is still possible with the --fast option to use representative " @@ -108,22 +109,19 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool): if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"]: raise Exception("The provided pangenome has no gene families sequences. " "This is not possible to annotate an input genome to this pangenome.") - + return predict_rgp, project_spots, project_modules -def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, - organism_name, circular_contigs, pangenome_params, - cpu, use_pseudo, disable_bar, tmpdir, config): - """ - """ +def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, organism_name, circular_contigs, + pangenome_params, cpu, use_pseudo, disable_bar, tmpdir, config): genome_name_to_path = None if input_mode == "multiple": if anno: input_type = "annotation" genome_name_to_path = parse_input_paths_file(anno) - + elif fasta: input_type = "fasta" genome_name_to_path = parse_input_paths_file(fasta) @@ -134,19 +132,19 @@ def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, if anno: input_type = "annotation" genome_name_to_path = {organism_name: {"path": anno, - "circular_contigs": circular_contigs}} - + "circular_contigs": circular_contigs}} + elif fasta: input_type = "fasta" genome_name_to_path = {organism_name: {"path": fasta, - "circular_contigs": circular_contigs}} + "circular_contigs": circular_contigs}} if input_type == "annotation": check_input_names(pangenome, genome_name_to_path) organisms, org_2_has_fasta = read_annotation_files(genome_name_to_path, cpu=cpu, pseudo=use_pseudo, - disable_bar=disable_bar) - + disable_bar=disable_bar) + if not all((has_fasta for has_fasta in org_2_has_fasta.values())): organisms_with_no_fasta = {org for org, has_fasta in org_2_has_fasta.items() if not has_fasta} if fasta: @@ -154,9 +152,9 @@ def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, else: raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) " - "genomes without associated sequence data, and you did not provide " - "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. " - f"The following genomes have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") + "genomes without associated sequence data, and you did not provide " + "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. " + f"The following genomes have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") elif input_type == "fasta": annotate_param_names = ["norna", "kingdom", @@ -165,22 +163,25 @@ def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, config) check_input_names(pangenome, genome_name_to_path) - organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path, tmpdir=tmpdir, cpu=cpu, - translation_table=int(pangenome_params.cluster.translation_table), norna=annotate_params.norna, kingdom=annotate_params.kingdom, - allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=disable_bar) + organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path, tmpdir=tmpdir, cpu=cpu, + translation_table=int(pangenome_params.cluster.translation_table), + norna=annotate_params.norna, kingdom=annotate_params.kingdom, + allow_overlap=annotate_params.allow_overlap, + procedure=annotate_params.prodigal_procedure, disable_bar=disable_bar) return organisms, genome_name_to_path, input_type -def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input_org_2_rgps:Dict[Organism, Set[Region]], - input_org_to_spots:Dict[Organism, Set[Spot]], - input_orgs_to_modules:Dict[Organism, Set[Module]] , - input_org_to_lonely_genes_count:Dict[Organism, int], - write_proksee:bool, write_gff:bool, write_table:bool, - add_sequences:bool, - genome_name_to_path:Dict[str,dict], input_type:str, - output_dir:Path, dup_margin:float, soft_core:float, - metadata_sep:str, compress:bool, - need_regions:bool, need_spots:bool, need_modules:bool): +def write_projection_results(pangenome: Pangenome, organisms: Set[Organism], + input_org_2_rgps: Dict[Organism, Set[Region]], + input_org_to_spots: Dict[Organism, Set[Spot]], + input_orgs_to_modules: Dict[Organism, Set[Module]], + input_org_to_lonely_genes_count: Dict[Organism, int], + write_proksee: bool, write_gff: bool, write_table: bool, + add_sequences: bool, + genome_name_to_path: Dict[str, dict], input_type: str, + output_dir: Path, dup_margin: float, soft_core: float, + metadata_sep: str, compress: bool, + need_regions: bool, need_spots: bool, need_modules: bool): """ Write the results of the projection of pangneome onto input genomes. @@ -214,14 +215,12 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input # single_copy_families = get_single_copy_families(pangenome, dup_margin) # multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) - - - # dup margin value here is specified in argument and is used to compute completeness. + # dup margin value here is specified in argument and is used to compute completeness. # Thats mean it can be different than dup margin used in spot and RGPS. - pangenome_persistent_single_copy_families = pangenome.get_single_copy_persistent_families(dup_margin = dup_margin, exclude_fragments=True) + pangenome_persistent_single_copy_families = pangenome.get_single_copy_persistent_families(dup_margin=dup_margin, + exclude_fragments=True) pangenome_persistent_count = len([fam for fam in pangenome.gene_families if fam.named_partition == "persistent"]) - soft_core_families = pangenome.soft_core_families(soft_core) exact_core_families = pangenome.exact_core_families() @@ -234,17 +233,17 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input singleton_gene_count = input_org_to_lonely_genes_count[organism] org_summary = summarize_projected_genome(organism, - pangenome_persistent_count, - pangenome_persistent_single_copy_families, - soft_core_families=soft_core_families, - exact_core_families=exact_core_families, - input_org_rgps=input_org_2_rgps.get(organism, None), - input_org_spots=input_org_to_spots.get(organism, None), - input_org_modules=input_orgs_to_modules.get(organism, None), - pangenome_file=pangenome.file, - singleton_gene_count=singleton_gene_count) + pangenome_persistent_count, + pangenome_persistent_single_copy_families, + soft_core_families=soft_core_families, + exact_core_families=exact_core_families, + input_org_rgps=input_org_2_rgps.get(organism, None), + input_org_spots=input_org_to_spots.get(organism, None), + input_org_modules=input_orgs_to_modules.get(organism, None), + pangenome_file=pangenome.file, + singleton_gene_count=singleton_gene_count) summaries.append(org_summary) - + yaml_outputfile = output_dir / organism.name / "projection_summary.yaml" write_summary_in_yaml(org_summary, yaml_outputfile) @@ -264,7 +263,7 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input genome_sequences=genome_sequences, compress=compress) if write_gff: - if input_type == "annotation": # if the genome has not been annotated by PPanGGOLiN + if input_type == "annotation": # if the genome has not been annotated by PPanGGOLiN annotation_sources = {"rRNA": "external", "tRNA": "external", "CDS": "external"} @@ -278,14 +277,13 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input if write_table: write_tsv_genome_file(organism, output_dir / organism.name, compress=compress, metadata_sep=metadata_sep, - need_regions=need_regions, need_spots=need_spots, need_modules=need_modules) + need_regions=need_regions, need_spots=need_spots, need_modules=need_modules) output_file = output_dir / "summary_projection.tsv" write_summaries_in_tsv(summaries, output_file=output_file, dup_margin=dup_margin, soft_core=soft_core) - def summarize_projected_genome(organism: Organism, @@ -494,7 +492,7 @@ def check_input_names(pangenome, input_names): f"{len(duplicated_names)} provided genome name(s) already exist in the given pangenome: {' '.join(duplicated_names)}") -def write_summary_in_yaml(summary_info: Dict[str, Any], output_file: Path): +def write_summary_in_yaml(summary_info: Dict[str, Any], output_file: Path): """ Write summary information to a YAML file. @@ -540,23 +538,25 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org logging.getLogger('PPanGGOLiN').info('Writing gene sequences of input genomes.') - for input_organism in input_organisms: - seq_outdir = output / input_organism.name - mk_outdir(seq_outdir, force=True) - - seq_fasta_file = seq_outdir / "cds_sequences.fasta" - - with open(seq_fasta_file, "w") as fh_out_faa: - write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True, - add="ppanggolin_") + input_genes = [gene for org in input_organisms for gene in org.genes] - seq_fasta_files.append(seq_fasta_file) + # for input_organism in input_organisms: + # seq_outdir = output / input_organism.name + # mk_outdir(seq_outdir, force=True) + # + # seq_fasta_file = seq_outdir / "cds_sequences.fasta" + # + # write_gene_sequences_from_annotations(input_organism.genes, seq_fasta_file, disable_bar=True, add="ppanggolin_") + # + # seq_fasta_files.append(seq_fasta_file) + seq_fasta_file = output / 'input_genes.fasta' - with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: + write_gene_sequences_from_annotations(input_genes, seq_fasta_file, disable_bar=True, add='ppanggolin_') + with create_tmpdir(main_dir=tmpdir, basename="projection_tmp", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome=pangenome, - sequence_files=seq_fasta_files, + sequence_files=seq_fasta_file, output=new_tmpdir, tmpdir=new_tmpdir, is_input_seq_nt=True, is_input_slf=True, cpu=cpu, no_defrag=no_defrag, @@ -564,16 +564,18 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org translation_table=translation_table) else: _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, - sequence_files=seq_fasta_files, + sequence_files=seq_fasta_file, output=new_tmpdir, tmpdir=new_tmpdir, is_input_seq_nt=True, is_input_slf=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) + input_org_to_lonely_genes_count = {} for input_organism in input_organisms: org_outdir = output / input_organism.name + mk_outdir(org_outdir, force=True) seq_set = {gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in input_organism.genes} @@ -606,7 +608,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org gene_family = GeneFamily(pangenome.max_fam_id, new_name) pangenome.add_gene_family(gene_family) - + gene_family.partition = "Cloud" lonely_genes.add(gene) @@ -616,7 +618,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org "The input genome contains a gene that aligns to a pangenome family " f"which already contains a gene with the same ID ({gene_id}). " f"The genome name has been appended to the family name: {new_name}") - + gene.ID = new_name # Add the gene to the gene family @@ -624,7 +626,6 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org pangenome._mk_gene_getter() # re-build the gene getter - logging.getLogger('PPanGGOLiN').info( f"{input_organism.name} has {len(lonely_genes)}/{input_organism.number_of_genes()} " "specific genes that do not align to any gene of the pangenome.") @@ -923,7 +924,7 @@ def predict_spot_in_one_organism( overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1, - compress:bool = False) -> Set[Spot]: + compress: bool = False) -> Set[Spot]: """ Predict spots for input organism RGPs. @@ -960,10 +961,11 @@ def predict_spot_in_one_organism( input_org_node_to_rgps[border_node].add(rgp) if len(input_org_node_to_rgps) == 0: - logging.getLogger("PPanGGOLiN").debug(f"{organism_name}: no RGPs of the input genome will be associated with any spot of insertion " - "as they are on a contig border (or have " - f"less than {set_size} persistent gene families until the contig border). " - "Projection of spots stops here") + logging.getLogger("PPanGGOLiN").debug( + f"{organism_name}: no RGPs of the input genome will be associated with any spot of insertion " + "as they are on a contig border (or have " + f"less than {set_size} persistent gene families until the contig border). " + "Projection of spots stops here") return set() # remove node that were already in the graph @@ -1048,7 +1050,7 @@ def predict_spot_in_one_organism( filename='input_genome_rgp_to_spot.tsv', compress=compress) input_org_spots = {spot for spots in input_rgp_to_spots.values() - for spot in spots } + for spot in spots} new_spots = {spot for spot in input_org_spots if isinstance(spot, NewSpot)} logging.getLogger('PPanGGOLiN').debug( @@ -1189,8 +1191,6 @@ def check_projection_arguments(args: argparse.Namespace, parser: argparse.Argume return input_mode - - def launch(args: argparse.Namespace): """ Command launcher @@ -1208,41 +1208,43 @@ def launch(args: argparse.Namespace): predict_rgp, project_spots, project_modules = check_pangenome_for_projection(pangenome, args.fast) - need_graph =True if args.table else False + need_graph = True if args.table else False check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False, need_spots=project_spots, need_graph=need_graph) - + logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.') pangenome_params = argparse.Namespace( **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) if predict_rgp: - #computing multigenics for rgp prediction first to have original family.number_of_genomes + # computing multigenics for rgp prediction first to have original family.number_of_genomes # and the same multigenics list as when rgp and spot were predicted multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) - - organisms, genome_name_to_path, input_type = manage_input_genomes_annotation(pangenome=pangenome, - input_mode=args.input_mode, - anno=args.anno, fasta=args.fasta, - organism_name=args.genome_name, - circular_contigs=args.circular_contigs, - pangenome_params=pangenome_params, - cpu=args.cpu, use_pseudo=args.use_pseudo, - disable_bar=args.disable_prog_bar, - tmpdir= args.tmpdir, config=args.config) - - + + organisms, genome_name_to_path, input_type = manage_input_genomes_annotation(pangenome=pangenome, + input_mode=args.input_mode, + anno=args.anno, fasta=args.fasta, + organism_name=args.genome_name, + circular_contigs=args.circular_contigs, + pangenome_params=pangenome_params, + cpu=args.cpu, + use_pseudo=args.use_pseudo, + disable_bar=args.disable_prog_bar, + tmpdir=args.tmpdir, config=args.config) input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, - output=output_dir, cpu=args.cpu, use_representatives=args.fast, - no_defrag=args.no_defrag, identity=args.identity, - coverage=args.coverage, tmpdir=args.tmpdir, - translation_table=int(pangenome_params.cluster.translation_table), - keep_tmp=args.keep_tmp, - disable_bar=args.disable_prog_bar) - + output=output_dir, cpu=args.cpu, + use_representatives=args.fast, + no_defrag=args.no_defrag, + identity=args.identity, + coverage=args.coverage, + tmpdir=args.tmpdir, + translation_table=int( + pangenome_params.cluster.translation_table), + keep_tmp=args.keep_tmp, + disable_bar=args.disable_prog_bar) input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = {}, {}, {} @@ -1250,38 +1252,41 @@ def launch(args: argparse.Namespace): logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.') - input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, - min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, output_dir=output_dir, - disable_bar=args.disable_prog_bar, compress=args.compress) + input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, + variable_gain=pangenome_params.rgp.variable_gain, + min_length=pangenome_params.rgp.min_length, + min_score=pangenome_params.rgp.min_score, multigenics=multigenics, + output_dir=output_dir, + disable_bar=args.disable_prog_bar, compress=args.compress) if project_spots: logging.getLogger('PPanGGOLiN').info('Predicting spot of insertion in input genomes.') input_org_to_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots), - initial_regions=pangenome.regions, - input_org_2_rgps=input_org_2_rgps, - multigenics=multigenics, - output=output_dir, - write_graph_flag=args.spot_graph, - graph_formats=args.graph_formats, - overlapping_match=pangenome_params.spot.overlapping_match, - set_size=pangenome_params.spot.set_size, - exact_match=pangenome_params.spot.exact_match_size, - compress=args.compress) + initial_regions=pangenome.regions, + input_org_2_rgps=input_org_2_rgps, + multigenics=multigenics, + output=output_dir, + write_graph_flag=args.spot_graph, + graph_formats=args.graph_formats, + overlapping_match=pangenome_params.spot.overlapping_match, + set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size, + compress=args.compress) if project_modules: input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir, compress=args.compress) write_projection_results(pangenome, organisms, input_org_2_rgps, - input_org_to_spots, - input_orgs_to_modules, - input_org_to_lonely_genes_count, - write_proksee=args.proksee, write_gff=args.gff, write_table=args.table, - add_sequences=args.add_sequences, - genome_name_to_path=genome_name_to_path, input_type=input_type, - output_dir=output_dir, dup_margin=args.dup_margin, soft_core=args.soft_core, - metadata_sep=args.metadata_sep, compress=args.compress, - need_modules=project_modules, need_spots=project_spots, need_regions=predict_rgp) - + input_org_to_spots, + input_orgs_to_modules, + input_org_to_lonely_genes_count, + write_proksee=args.proksee, write_gff=args.gff, write_table=args.table, + add_sequences=args.add_sequences, + genome_name_to_path=genome_name_to_path, input_type=input_type, + output_dir=output_dir, dup_margin=args.dup_margin, soft_core=args.soft_core, + metadata_sep=args.metadata_sep, compress=args.compress, + need_modules=project_modules, need_spots=project_spots, need_regions=predict_rgp) + def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ @@ -1355,12 +1360,12 @@ def parser_projection(parser: argparse.ArgumentParser): help="minimum ratio of genomes in which the family must have multiple genes " "for it to be considered 'duplicated'. " "This metric is used to compute completeness and duplication of the input genomes") - + optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, - help="Soft core threshold used when generating general statistics on the projected genome. " - "This threshold does not influence PPanGGOLiN's partitioning. " - "The value determines the minimum fraction of genomes that must possess a gene family " - "for it to be considered part of the soft core.") + help="Soft core threshold used when generating general statistics on the projected genome. " + "This threshold does not influence PPanGGOLiN's partitioning. " + "The value determines the minimum fraction of genomes that must possess a gene family " + "for it to be considered part of the soft core.") optional.add_argument("--spot_graph", required=False, action="store_true", help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs " @@ -1374,13 +1379,13 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument("--proksee", required=False, action="store_true", help="Generate JSON map files for PROKSEE with projected pangenome annotations for each input genome.") - + optional.add_argument("--table", required=False, action="store_true", help="Generate a tsv file for each input genome with pangenome annotations.") - + optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") - + optional.add_argument("--add_sequences", required=False, action="store_true", help="Include input genome DNA sequences in GFF and Proksee output.") @@ -1392,21 +1397,21 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") - + optional.add_argument("--add_metadata", - required=False, - action="store_true", - help="Include metadata information in the output files " - "if any have been added to pangenome elements (see ppanggolin metadata command).") + required=False, + action="store_true", + help="Include metadata information in the output files " + "if any have been added to pangenome elements (see ppanggolin metadata command).") optional.add_argument("--metadata_sources", - default=None, - nargs="+", - help="Which source of metadata should be written. " - "By default all metadata sources are included.") + default=None, + nargs="+", + help="Which source of metadata should be written. " + "By default all metadata sources are included.") optional.add_argument("--metadata_sep", - required=False, - default='|', - help="The separator used to join multiple metadata values for elements with multiple metadata" - " values from the same source. This character should not appear in metadata values.") + required=False, + default='|', + help="The separator used to join multiple metadata values for elements with multiple metadata" + " values from the same source. This character should not appear in metadata values.") From 1516f124eaa929454728c99e29c645dc9e4d7823 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 7 Jun 2024 15:28:31 +0200 Subject: [PATCH 21/25] Improve GitHub workflow --- .github/workflows/main.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2abe5b7f..88088f66 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -95,12 +95,12 @@ jobs: ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --regions --borders --families_tsv --cpu 1 ppanggolin write_genomes -p stepbystep/pangenome.h5 --output stepbystep -f --fasta genomes.fasta.list --gff --proksee --table ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta genomes.fasta.list - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families rgp --gene_families rgp + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families rgp --gene_families rgp --compress ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families softcore --gene_families softcore ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0 - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --cpu $NUM_CPUS --keep_tmp + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes core --proteins cloud + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 --compress + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --cpu $NUM_CPUS --keep_tmp --compress ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log @@ -180,6 +180,8 @@ jobs: head genomes.gbff.list | sed 's/^/input_genome_/g' > genomes.gbff.head.list ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno genomes.gbff.head.list --gff --proksee --cpu $NUM_CPUS + head genomes.fasta.list | sed 's/^/input_genome_/g' > genomes.fasta.head.list + ppanggolin projection --pangenome myannopang/pangenome.h5 -o projection_from_list_of_fasta --fasta genomes.fasta.head.list --gff --proksee --cpu $NUM_CPUS ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \ --genome_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ From e909a6525778464562a5cecbec46c3c71124c326 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 10 Jun 2024 16:14:46 +0200 Subject: [PATCH 22/25] simplify unit test --- pyproject.toml | 6 -- tests/align/test_align.py | 159 ++++++++++++++++---------------------- 2 files changed, 65 insertions(+), 100 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index abdbf2d7..7864f533 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,9 +81,3 @@ packages = ["ppanggolin"] [tool.setuptools.dynamic] version = {file = "VERSION"} - -[tool.pytest.ini_options] -markers = [ - "nucleotides: marks tests to run with nucleotides alphabets", - "aminoacids: marks tests to run with amino-acids alphabets", -] \ No newline at end of file diff --git a/tests/align/test_align.py b/tests/align/test_align.py index e5ce2dde..42d1cd5f 100644 --- a/tests/align/test_align.py +++ b/tests/align/test_align.py @@ -6,106 +6,77 @@ @pytest.fixture -def nucleotides(): - yield ['A', 'C', 'G', 'T', 'N'] +def single_line_fasta_nt() -> List: + + return ['>Gene_1 seq_description\n', + 'ATGCGTTGTCGTTG\n', + ">Gene_2\n", + "TGTGACCTGCT\n" + ] @pytest.fixture -def aminoacids(): - amino_acid_alphabet = [ - 'A', # Alanine - 'R', # Arginine - 'N', # Asparagine - 'D', # Aspartic acid - 'C', # Cysteine - 'E', # Glutamic acid - 'Q', # Glutamine - 'G', # Glycine - 'H', # Histidine - 'I', # Isoleucine - 'L', # Leucine - 'K', # Lysine - 'M', # Methionine - 'F', # Phenylalanine - 'P', # Proline - 'S', # Serine - 'T', # Threonine - 'W', # Tryptophan - 'Y', # Tyrosine - 'V' # Valine - ] - yield amino_acid_alphabet +def single_line_fasta_aa() -> List: + + return ['>Gene_1 seq_description\n', + 'YWTPRPFFYAAEYNN\n', + ">Gene_2\n", + "YWTPRPSYWTPAAEYNN\n" + ] @pytest.fixture -def number_of_sequences(): - yield randint(4, 10) -@pytest.fixture -def single_line_fasta(request, tmp_path_factory: pytest.TempPathFactory, number_of_sequences, - aminoacids: List[str], nucleotides: List[str]): - if request.node.get_closest_marker('aminoacids'): - alphabet = aminoacids - fasta_path = tmp_path_factory.getbasetemp() / "single_line_nt.fasta" - else: - alphabet = nucleotides - fasta_path = tmp_path_factory.getbasetemp() / "single_line_aa.fasta" - with open(fasta_path, "w") as fasta: - for i in range(number_of_sequences): - fasta.write(f">Gene_{i}\n") - fasta.write("".join([choice(alphabet) for _ in range(0, randint(30, 100))])) - fasta.write("\n") - yield fasta_path +def multi_line_fasta_nt() -> List: + + return ['>Gene_1 seq_description\n', + 'ATGCGT\n', + 'TGTCGTTG\n', + ">Gene_2\n", + "TGTGACCTGCT\n" + ] @pytest.fixture -def multi_line_fasta(request, tmp_path_factory: pytest.TempPathFactory, number_of_sequences, - aminoacids: List[str], nucleotides: List[str]): - if request.node.get_closest_marker('aminoacids'): - alphabet = aminoacids - fasta_path = tmp_path_factory.getbasetemp() / "single_line_nt.fasta" - else: - alphabet = nucleotides - fasta_path = tmp_path_factory.getbasetemp() / "single_line_aa.fasta" - with open(fasta_path, "w") as fasta: - for i in range(number_of_sequences): - fasta.write(f">Gene_{i}\n") - for j in range(randint(4,10)): - fasta.write("".join([choice(alphabet) for _ in range(60)])) - fasta.write("\n") - yield fasta_path - -@pytest.mark.nucleotides -def test_get_seq_ids_single_line_nt(number_of_sequences, single_line_fasta): - with open(single_line_fasta, "r") as fasta: - seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) - assert len(seq_set) == number_of_sequences - assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} - assert is_nucleotide - assert single_line_fasta - -@pytest.mark.aminoacids -def test_get_seq_ids_single_line_aa(number_of_sequences, single_line_fasta): - with open(single_line_fasta, "r") as fasta: - seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) - assert len(seq_set) == number_of_sequences - assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} - assert not is_nucleotide - assert single_line_fasta - -@pytest.mark.nucleotides -def test_get_seq_ids_multi_line_nt(number_of_sequences, multi_line_fasta): - with open(multi_line_fasta, "r") as fasta: - seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) - assert len(seq_set) == number_of_sequences - assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} - assert is_nucleotide - assert not single_line_fasta - -@pytest.mark.aminoacids -def test_get_seq_ids_multi_line_aa(number_of_sequences, multi_line_fasta): - with open(multi_line_fasta, "r") as fasta: - seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) - assert len(seq_set) == number_of_sequences - assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} - assert not is_nucleotide - assert not single_line_fasta +def multi_line_fasta_aa() -> List: + + return ['>Gene_1 seq_description\n', + 'AAEYNN\n', + 'YWTPRPFFY\n', + ">Gene_2\n", + "YWTPRPS\n", + "YWTPAAEYNN\n" + ] + + +def test_get_seq_ids_single_line_nt(single_line_fasta_nt): + + + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(single_line_fasta_nt) + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert is_nucleotide + assert single_line_fasta + +def test_get_seq_ids_single_line_aa(single_line_fasta_aa): + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(single_line_fasta_aa) + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert not is_nucleotide + assert single_line_fasta + +def test_get_seq_ids_multi_line_nt(multi_line_fasta_nt): + + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(multi_line_fasta_nt) + + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert is_nucleotide + assert not single_line_fasta + +def test_get_seq_ids_multi_line_aa(multi_line_fasta_aa): + + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(multi_line_fasta_aa) + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert not is_nucleotide + assert not single_line_fasta From 0e69f70f7dc79c4a47e080b4337bbd9a75e90152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 10 Jun 2024 16:39:48 +0200 Subject: [PATCH 23/25] Solve requested change --- ppanggolin/cluster/cluster.py | 1 - ppanggolin/formats/writeSequences.py | 2 +- tests/pytest.ini | 4 ++++ 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 tests/pytest.ini diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index faab8873..e11793ec 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -501,7 +501,6 @@ def parser_clust(parser: argparse.ArgumentParser): "with their original gene family.") clust.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") - # clust.add_argument("--compress") clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 272e1a8a..5f188f99 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -501,7 +501,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa for line in read_compressed_or_not(organisms_file): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - raise SyntaxError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") + raise ValueError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") org_dict[elements[0]] = Path(elements[1]) if not org_dict[elements[0]].exists(): # Check tsv sanity test if it's not one it's the other org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]]) diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 00000000..393e1386 --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + nucleotides: marks tests to run with nucleotides alphabets + aminoacids: marks tests to run with amino-acids alphabets \ No newline at end of file From 93a4b01543d456c0f8ab8883ece6a928173adeb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 10 Jun 2024 17:30:41 +0200 Subject: [PATCH 24/25] Make alignment with MMSeqs2 more flexible. --- ppanggolin/align/alignOnPang.py | 46 +++++++++++++------------ ppanggolin/context/searchGeneContext.py | 11 +++--- ppanggolin/projection/projection.py | 9 +++-- 3 files changed, 33 insertions(+), 33 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 2b5547d3..ca72ed9b 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -27,7 +27,7 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]], tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, - is_query_nt: bool = False, is_query_slf: bool = False, is_target_nt: bool = False, + query_type: str = "unknow", is_query_slf: bool = False, target_type: str = "unknow", is_target_slf: bool = False, translation_table: int = None) -> Path: """ Align fasta sequence to pangenome sequences. @@ -39,34 +39,36 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi :param no_defrag: Do not apply defragmentation :param identity: minimal identity threshold for the alignment :param coverage: minimal identity threshold for the alignment - :param is_query_nt: Is the sequence file (query) are nucleotide sequences. If True, sequences are translated by mmseqs + :param query_type: Sequences type of the file (query). [nucleotide, protein, unknow] :param is_query_slf: Is the sequence file (query) with single line fasta. If True, MMSeqs2 database will be with soft link - :param is_target_nt: Is the sequences of pangenome (target) are nucleotide sequences. If True, sequences are translated by mmseqs + :param target_type: Sequences type of pangenome (target). [nucleotide, aminoacid, protein] :param is_target_slf: Is the sequences of pangenome (target) with single line fasta. If True, MMSeqs2 database will be with soft link :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated. :return: Alignment result file """ - if is_target_nt: + if target_type == "nucleotide": logging.getLogger("PPanGGOLiN").debug("Target sequences will be translated by mmseqs with " f"translation table {translation_table}") with create_tmpdir(tmpdir, basename="target_db", keep_tmp=True) as target_db_dir: # Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same target_db = translate_genes(target_seq_file, target_db_dir, cpu, is_target_slf, translation_table) else: + db_type = 1 if target_type == "protein" else 0 target_db = create_mmseqs_db([target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file, - 'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=1) + 'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=db_type) - if is_query_nt: + if query_type == "nucleotide": logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs " f"with translation table {translation_table}") with create_tmpdir(tmpdir, basename="query_db", keep_tmp=True) as query_db_dir: # Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same query_db = translate_genes(query_seq_files, query_db_dir, cpu, is_query_slf, translation_table) else: + db_type = 1 if query_type == "protein" else 0 query_db = create_mmseqs_db([query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files, - 'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=1) + 'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=db_type) cov_mode = "2" # coverage of query if no_defrag: @@ -393,7 +395,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path, - tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1, + tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, disable_bar: bool = False ) -> Tuple[Path, Dict[str, GeneFamily]]: @@ -407,7 +409,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union :param sequence_files: Iterable of paths of FASTA files containing input sequences to align. :param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. - :param is_input_seq_nt: Is input sequence file nucleotide sequences. + :param input_type: Type of input sequence file. [nucleotide, aminoacid, unknow] :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link :param cpu: Number of CPU cores to use for the alignment (default: 1). :param no_defrag: If True, the defragmentation workflow is skipped (default: False). @@ -424,10 +426,10 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {pangenome_sequences.absolute()}') write_gene_fam_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar) - align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, - tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, - is_target_nt=False, is_target_slf=True, translation_table=translation_table) + align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True, + target_type="protein", translation_table=translation_table) seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) @@ -435,7 +437,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path, - tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1, + tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, disable_bar: bool = False ) -> Tuple[Path, Dict[str, GeneFamily]]: @@ -449,7 +451,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Unio :param sequence_files: Iterable of paths of FASTA files containing input sequences to align. :param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. - :param is_input_seq_nt: Is input sequence file nucleotide sequences. + :param input_type: Sequences type of the file (query). [nucleotide, protein, unknow] :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link :param cpu: Number of CPU cores to use for the alignment (default: 1). :param no_defrag: If True, the defragmentation workflow is skipped (default: False). @@ -467,8 +469,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Unio align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True, - is_target_slf=True, translation_table=translation_table) + query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True, + target_type="nucleotide", translation_table=translation_table) seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) @@ -519,19 +521,19 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo seq_set, is_nucleotide, single_line_fasta = get_seq_ids(seqFileObj) with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: - + input_type = "nucleotide" if is_nucleotide else "unknow" if use_representatives: align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, - is_input_seq_nt=is_nucleotide, - is_input_slf=single_line_fasta, - cpu=cpu, no_defrag=no_defrag, identity=identity, + input_type=input_type, + is_input_slf=single_line_fasta, cpu=cpu, + no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) else: align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file, output=output, tmpdir=new_tmpdir, - is_input_seq_nt=is_nucleotide, + input_type=input_type, is_input_slf=single_line_fasta, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 508203fc..512d12a6 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -74,18 +74,17 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences") with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: - + input_type = "nucleotide" if is_nucleotide else "unknow" if use_representatives: _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, - is_input_seq_nt=is_nucleotide, is_input_slf=is_slf, - cpu=cpu, no_defrag=no_defrag, - identity=identity, coverage=coverage, - translation_table=translation_table, + input_type=input_type, is_input_slf=is_slf, cpu=cpu, + no_defrag=no_defrag, identity=identity, + coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) else: _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file, output=output, tmpdir=new_tmpdir, - is_input_seq_nt=is_nucleotide, is_input_slf=is_slf, + input_type=input_type, is_input_slf=is_slf, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 08c0da5f..78cfc0d3 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -558,17 +558,16 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org with create_tmpdir(main_dir=tmpdir, basename="projection_tmp", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome=pangenome, - sequence_files=seq_fasta_file, - output=new_tmpdir, tmpdir=new_tmpdir, - is_input_seq_nt=True, is_input_slf=True, - cpu=cpu, no_defrag=no_defrag, + sequence_files=seq_fasta_file, output=new_tmpdir, + tmpdir=new_tmpdir, input_type="nucleotide", + is_input_slf=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table) else: _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=seq_fasta_file, output=new_tmpdir, tmpdir=new_tmpdir, - is_input_seq_nt=True, is_input_slf=True, + input_type="nucleotide", is_input_slf=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, From f30ab344592a070116dd842912e9239993e9a53e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 10 Jun 2024 17:31:31 +0200 Subject: [PATCH 25/25] clean old code --- ppanggolin/projection/projection.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 78cfc0d3..8a734201 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -536,21 +536,10 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org :return: Number of genes that do not cluster with any of the gene families of the pangenome. """ - seq_fasta_files = [] - logging.getLogger('PPanGGOLiN').info('Writing gene sequences of input genomes.') input_genes = [gene for org in input_organisms for gene in org.genes] - # for input_organism in input_organisms: - # seq_outdir = output / input_organism.name - # mk_outdir(seq_outdir, force=True) - # - # seq_fasta_file = seq_outdir / "cds_sequences.fasta" - # - # write_gene_sequences_from_annotations(input_organism.genes, seq_fasta_file, disable_bar=True, add="ppanggolin_") - # - # seq_fasta_files.append(seq_fasta_file) seq_fasta_file = output / 'input_genes.fasta' write_gene_sequences_from_annotations(input_genes, seq_fasta_file, disable_bar=True, add='ppanggolin_')