From 31e50f5197aa2f2bbef6eb82c3f8564eddbfdbce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Thu, 28 Mar 2024 16:56:50 +0100
Subject: [PATCH 01/25] Add option to write proteins sequences from genes in
 pangenome

---
 ppanggolin/cluster/cluster.py        | 14 +----
 ppanggolin/formats/writeSequences.py | 86 ++++++++++++++++++++++++----
 2 files changed, 78 insertions(+), 22 deletions(-)

diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index 14418e2a..9ce34b2a 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -23,7 +23,7 @@
 from ppanggolin.utils import read_compressed_or_not, restricted_float
 from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome
 from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file
-from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations
+from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes
 from ppanggolin.utils import mk_outdir
 
 
@@ -68,7 +68,6 @@ def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force
                         "or provide a way to access the gene sequence during the annotation step "
                         "(having the fasta in the gff files, or providing the fasta files through the --fasta option)")
 
-
 def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8,
                      identity: float = 0.8, mode: int = 1) -> Tuple[Path, Path]:
     """
@@ -84,16 +83,7 @@ def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int =
 
     :return: path to representative sequence file and path to tsv clustering result
     """
-    seq_nucdb = tmpdir / 'nucleotid_sequences_db'
-    cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb]))
-    logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
-    logging.getLogger("PPanGGOLiN").info("Creating sequence database...")
-    subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
-    logging.getLogger("PPanGGOLiN").debug("Translate sequence ...")
-    seqdb = tmpdir / 'aa_db'
-    cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code]))
-    logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
-    subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
+    seqdb = translate_genes(sequences, tmpdir, cpu, code)
     logging.getLogger("PPanGGOLiN").info("Clustering sequences...")
     cludb = tmpdir / 'cluster_db'
     cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id",
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 900e3496..63a78231 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -5,8 +5,11 @@
 import argparse
 import logging
 import re
+import subprocess
 from pathlib import Path
 from typing import TextIO, Dict, Set, Iterable
+import tempfile
+import shutil
 
 # installed libraries
 from tqdm import tqdm
@@ -18,6 +21,7 @@
 from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype
 from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file
 
+
 module_regex = re.compile(r'^module_\d+')  #\d == [0-9]
 poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex]
 poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id."
@@ -43,6 +47,20 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o
     file_obj.flush()
 
 
+def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path:
+    seq_nucdb = tmpdir / 'nucleotid_sequences_db'
+    cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb]))
+    logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
+    logging.getLogger("PPanGGOLiN").info("Creating sequence database...")
+    subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
+    logging.getLogger("PPanGGOLiN").debug("Translate sequence ...")
+    seqdb = tmpdir / 'aa_db'
+    cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code]))
+    logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
+    subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
+    return seqdb
+
+
 def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_core: float = 0.95,
                          compress: bool = False, disable_bar: bool = False):
     """
@@ -78,6 +96,40 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
     logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'")
 
 
+def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: str, soft_core: float = 0.95,
+                                 compress: bool = False, keep_tmp: bool = False, tmp: Path = None,
+                                 cpu: int = 1, code: int = 11, disable_bar: bool = False):
+    """ Write all amino acid sequences from given genes in pangenome
+
+    :param pangenome: Pangenome object with gene families sequences
+    :param output: Path to output directory
+    :param genes_prot: Selected partition of gene
+    :param soft_core: Soft core threshold to use
+    :param compress: Compress the file in .gz
+    :param keep_tmp: Keep temporary directory
+    :param tmp: Path to temporary directory
+    :param cpu: Number of CPU available
+    :param code: Genetic code use to translate nucleotide sequences to protein sequences
+    :param disable_bar: Disable progress bar
+    """
+    tmpdir = tmp/"translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes")
+    mk_outdir(tmpdir, True, True)
+
+    write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar)
+
+    with open(tmpdir/f"{genes_prot}_genes.fna") as sequences:
+        translate_db = translate_genes(sequences, tmpdir, cpu, code)
+    outpath = output/f"{genes_prot}_protein_genes.fna"
+    cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
+    logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
+    subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
+    logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'")
+
+    if not keep_tmp:
+        logging.getLogger("PPanGGOLiN").debug("Clean temporary directory")
+        shutil.rmtree(tmpdir)
+
+
 def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]:
     """
     function used to filter down families to the given partition
@@ -93,13 +145,13 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c
     if partition == 'all':
         logging.getLogger("PPanGGOLiN").info(f"Writing all of the {type_name}...")
         genefams = pangenome.gene_families
-    
+
     elif partition in ['persistent', 'shell', 'cloud']:
         logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} of the {partition}...")
         for fam in pangenome.gene_families:
             if fam.named_partition == partition:
                 genefams.add(fam)
-    
+
     elif partition == "rgp":
         logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} in RGPs...")
         for region in pangenome.regions:
@@ -342,8 +394,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa
 
 
 def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None,
-                         soft_core: float = 0.95, regions: str = None, genes: str = None, gene_families: str = None,
-                         prot_families: str = None, compress: bool = False, disable_bar: bool = False):
+                         soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None,
+                         gene_families: str = None, prot_families: str = None, compress: bool = False, disable_bar: bool = False):
     """
     Main function to write sequence file from pangenome
 
@@ -359,7 +411,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
     :param compress: Compress the file in .gz
     :param disable_bar: Disable progress bar
     """
-    if not any(x for x in [regions, genes, prot_families, gene_families]):
+    if not any(x for x in [regions, genes, genes_prot, prot_families, gene_families]):
         raise Exception("You did not indicate what file you wanted to write.")
 
     need_annotations = False
@@ -375,7 +427,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
         if prot_families in ["core", "softcore"]:
             need_annotations = True
 
-    if any(x is not None for x in [regions, genes, gene_families]):
+    if any(x is not None for x in [regions, genes, genes_prot, gene_families]):
         need_annotations = True
         need_families = True
     if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)):
@@ -394,6 +446,8 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
         provided_filter = ''
         if genes is not None:
             provided_filter = genes
+        if genes_prot is not None:
+            provided_filter = genes_prot
         if gene_families is not None:
             provided_filter = gene_families
         if prot_families is not None:
@@ -422,6 +476,9 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
         write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar)
     if genes is not None:
         write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar)
+    if genes_prot is not None:
+        write_gene_protein_sequences(pangenome, output, genes_prot, soft_core, compress,
+                                     disable_bar=disable_bar)
     if regions is not None:
         write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar)
 
@@ -439,7 +496,7 @@ def launch(args: argparse.Namespace):
     pangenome = Pangenome()
     pangenome.add_file(args.pangenome)
     write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core,
-                         regions=args.regions, genes=args.genes, gene_families=args.gene_families,
+                         regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, gene_families=args.gene_families,
                          prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar)
 
 
@@ -477,7 +534,7 @@ def parser_seq(parser: argparse.ArgumentParser):
 
     :param parser: parser for align argument
     """
-    
+
     required = parser.add_argument_group(title="Required arguments",
                                          description="One of the following arguments is required :")
     required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file")
@@ -493,7 +550,7 @@ def parser_seq(parser: argparse.ArgumentParser):
                          help="A tab-separated file listing the genome names, and the gff/gbff filepath of its "
                               "annotations (the files can be compressed with gzip). One line per genome. "
                               "If this is provided, those annotations will be used.")
-    
+
     onereq = parser.add_argument_group(title="Output file",
                                        description="At least one of the following argument is required. "
                                                    "Indicating 'all' writes all elements. Writing a partition "
@@ -502,11 +559,13 @@ def parser_seq(parser: argparse.ArgumentParser):
                                        )
     onereq.add_argument("--genes", required=False, type=filter_values,
                         help=f"Write all nucleotide CDS sequences. {poss_values_log}")
+    onereq.add_argument("--genes_prot", required=False, type=filter_values,
+                        help=f"Write representative amino acid sequences of genes. {poss_values_log}")
     onereq.add_argument("--prot_families", required=False, type=filter_values,
                         help=f"Write representative amino acid sequences of gene families. {poss_values_log}")
     onereq.add_argument("--gene_families", required=False, type=filter_values,
                         help=f"Write representative nucleotide sequences of gene families. {poss_values_log}")
-    
+
     optional = parser.add_argument_group(title="Optional arguments")
     # could make choice to allow customization
     optional.add_argument("--regions", required=False, type=str, choices=["all", "complete"],
@@ -515,6 +574,13 @@ def parser_seq(parser: argparse.ArgumentParser):
     optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95,
                           help="Soft core threshold to use if 'softcore' partition is chosen")
     optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz")
+    optional.add_argument("--translation_table", required=False, default="11",
+                          help="Translation table (genetic code) to use.")
+    optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
+    optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()),
+                          help="directory for storing temporary files")
+    optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
+                          help="Keeping temporary files (useful for debugging).")
 
 
 if __name__ == '__main__':

From e54565c9adbdc46383b03c78fb9144a0271ac20f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Thu, 28 Mar 2024 17:25:52 +0100
Subject: [PATCH 02/25] Split function to add check function

---
 ppanggolin/formats/writeSequences.py | 155 ++++++++++++++++-----------
 1 file changed, 93 insertions(+), 62 deletions(-)

diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 63a78231..a3753d46 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -27,6 +27,94 @@
 poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id."
 
 
+def check_write_sequences_args(args: argparse.Namespace) -> None:
+    """Check arguments compatibility in CLI
+
+    :param args: argparse namespace arguments from CLI
+
+    :raises argparse.ArgumentTypeError: if region is given but neither fasta or anno is given
+    """
+    if args.regions is not None and args.fasta is None and args.anno is None:
+        raise argparse.ArgumentError(argument=None,
+                                     message="The --regions options requires the use of --anno or --fasta "
+                                             "(You need to provide the same file used to compute the pangenome)")
+
+
+def check_pangenome_to_write_sequences(pangenome: Pangenome, regions: str = None, genes: str = None,
+                                       genes_prot: str = None, gene_families: str = None,
+                                       prot_families: str = None, disable_bar: bool = False):
+    """Check and load required information from pangenome
+
+    :param pangenome: Empty pangenome
+    :param regions: Check and load the RGP
+    :param genes: Check and load the genes
+    :param genes_prot: Write amino acid CDS sequences.
+    :param gene_families: Check and load the gene families to write representative nucleotide sequences.
+    :param prot_families: Check and load the gene families to write representative amino acid sequences.
+    :param disable_bar: Disable progress bar
+
+    :raises AssertionError: if not any arguments to write any file is given
+    :raises ValueError: if the given filter is not recognized
+    :raises AttributeError: if the pangenome does not contain the required information
+    """
+    if not any(x for x in [regions, genes, genes_prot, prot_families, gene_families]):
+        raise AssertionError("You did not indicate what file you wanted to write.")
+
+    need_annotations = False
+    need_families = False
+    need_graph = False
+    need_partitions = False
+    need_spots = False
+    need_regions = False
+    need_modules = False
+
+    if prot_families is not None:
+        need_families = True
+        if prot_families in ["core", "softcore"]:
+            need_annotations = True
+
+    if any(x is not None for x in [regions, genes, genes_prot, gene_families]):
+        need_annotations = True
+        need_families = True
+    if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)):
+        need_annotations = True
+        need_regions = True
+    if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)):
+        need_partitions = True
+    for x in (genes, gene_families, prot_families):
+        if x is not None and 'module_' in x:
+            need_modules = True
+
+    if not (need_annotations or need_families or need_graph or need_partitions or
+            need_spots or need_regions or need_modules):
+        # then nothing is needed, then something is wrong.
+        # find which filter was provided
+        provided_filter = ''
+        if genes is not None:
+            provided_filter = genes
+        if genes_prot is not None:
+            provided_filter = genes_prot
+        if gene_families is not None:
+            provided_filter = gene_families
+        if prot_families is not None:
+            provided_filter = prot_families
+        if regions is not None:
+            provided_filter = regions
+        raise ValueError(f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. "
+                         f"{poss_values_log}")
+
+    if pangenome.status["geneSequences"] not in ["inFile"] and (genes or gene_families):
+        raise AttributeError("The provided pangenome has no gene sequences. "
+                             "This is not compatible with any of the following options : --genes, --gene_families")
+    if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"] and prot_families:
+        raise AttributeError("The provided pangenome has no gene families. This is not compatible with any of "
+                             "the following options : --prot_families, --gene_families")
+
+    check_pangenome_info(pangenome, need_annotations=need_annotations, need_families=need_families,
+                         need_graph=need_graph, need_partitions=need_partitions, need_rgp=need_regions,
+                         need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar)
+
+
 def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_obj: TextIO, add: str = '',
                                           disable_bar: bool = False):
     """
@@ -46,7 +134,6 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o
             file_obj.write(f'{gene.dna}\n')
     file_obj.flush()
 
-
 def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path:
     seq_nucdb = tmpdir / 'nucleotid_sequences_db'
     cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb]))
@@ -395,7 +482,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa
 
 def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None,
                          soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None,
-                         gene_families: str = None, prot_families: str = None, compress: bool = False, disable_bar: bool = False):
+                         gene_families: str = None, prot_families: str = None, compress: bool = False,
+                         disable_bar: bool = False):
     """
     Main function to write sequence file from pangenome
 
@@ -406,69 +494,14 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
     :param soft_core: Soft core threshold to use
     :param regions: Write the RGP nucleotide sequences
     :param genes: Write all nucleotide CDS sequences
+    :param genes_prot: Write amino acid CDS sequences.
     :param gene_families: Write representative nucleotide sequences of gene families.
     :param prot_families: Write representative amino acid sequences of gene families.
     :param compress: Compress the file in .gz
     :param disable_bar: Disable progress bar
     """
-    if not any(x for x in [regions, genes, genes_prot, prot_families, gene_families]):
-        raise Exception("You did not indicate what file you wanted to write.")
 
-    need_annotations = False
-    need_families = False
-    need_graph = False
-    need_partitions = False
-    need_spots = False
-    need_regions = False
-    need_modules = False
-
-    if prot_families is not None:
-        need_families = True
-        if prot_families in ["core", "softcore"]:
-            need_annotations = True
-
-    if any(x is not None for x in [regions, genes, genes_prot, gene_families]):
-        need_annotations = True
-        need_families = True
-    if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)):
-        need_annotations = True
-        need_regions = True
-    if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)):
-        need_partitions = True
-    for x in (genes, gene_families, prot_families):
-        if x is not None and 'module_' in x:
-            need_modules = True
-
-    if not (need_annotations or need_families or need_graph or need_partitions or
-            need_spots or need_regions or need_modules):
-        # then nothing is needed, then something is wrong.
-        # find which filter was provided
-        provided_filter = ''
-        if genes is not None:
-            provided_filter = genes
-        if genes_prot is not None:
-            provided_filter = genes_prot
-        if gene_families is not None:
-            provided_filter = gene_families
-        if prot_families is not None:
-            provided_filter = prot_families
-        if regions is not None:
-            provided_filter = regions
-        raise Exception(f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. "
-                        f"{poss_values_log}")
-    ex_gene_sequences = Exception("The provided pangenome has no gene sequences. "
-                                  "This is not compatible with any of the following options : --genes, --gene_families")
-    ex_gene_family_sequences = Exception("The provided pangenome has no gene families. "
-                                         "This is not compatible with any of the following options : "
-                                         "--prot_families, --gene_families")
-    if pangenome.status["geneSequences"] not in ["inFile"] and (genes or gene_families):
-        raise ex_gene_sequences
-    if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"] and prot_families:
-        raise ex_gene_family_sequences
-
-    check_pangenome_info(pangenome, need_annotations=need_annotations, need_families=need_families,
-                         need_graph=need_graph, need_partitions=need_partitions, need_rgp=need_regions,
-                         need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar)
+    check_pangenome_to_write_sequences(pangenome, regions, genes, genes_prot, gene_families, prot_families, disable_bar)
 
     if prot_families is not None:
         write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar)
@@ -489,9 +522,7 @@ def launch(args: argparse.Namespace):
 
     :param args: All arguments provide by user
     """
-    if args.regions is not None and args.fasta is None and args.anno is None:
-        raise Exception("The --regions options requires the use of --anno or --fasta "
-                        "(You need to provide the same file used to compute the pangenome)")
+    check_write_sequences_args(args)
     mk_outdir(args.output, args.force)
     pangenome = Pangenome()
     pangenome.add_file(args.pangenome)

From 035330ecaafe34544579d69cb03e6d5e0c17d96f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Thu, 28 Mar 2024 17:34:49 +0100
Subject: [PATCH 03/25] Little refactoring

---
 ppanggolin/formats/writeSequences.py | 47 +++++++++++++++++++---------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index a3753d46..c68eac5e 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -18,11 +18,11 @@
 from ppanggolin.pangenome import Pangenome
 from ppanggolin.geneFamily import GeneFamily
 from ppanggolin.genome import Gene, Organism
-from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype
+from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, \
+    detect_filetype
 from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file
 
-
-module_regex = re.compile(r'^module_\d+')  #\d == [0-9]
+module_regex = re.compile(r'^module_\d+')  # \d == [0-9]
 poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex]
 poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id."
 
@@ -134,8 +134,18 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o
             file_obj.write(f'{gene.dna}\n')
     file_obj.flush()
 
+
 def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path:
-    seq_nucdb = tmpdir / 'nucleotid_sequences_db'
+    """Translate nucleotide sequences into MMSeqs2 amino acid sequences database
+
+    :param sequences: File with the nucleotide sequences
+    :param tmpdir: Temporary directory to save the MMSeqs2 files
+    :param cpu: Number of available CPU cores to use
+    :param code: Translation code to use
+
+    :return: Path to the MMSeqs2 database
+    """
+    seq_nucdb = tmpdir / 'nucleotide_sequences_db'
     cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb]))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
     logging.getLogger("PPanGGOLiN").info("Creating sequence database...")
@@ -159,6 +169,8 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
     :param soft_core: Soft core threshold to use
     :param compress: Compress the file in .gz
     :param disable_bar: Disable progress bar
+
+    :raises AttributeError: If the pangenome does not contain gene sequences
     """
 
     logging.getLogger("PPanGGOLiN").info("Writing all the gene nucleotide sequences...")
@@ -179,7 +191,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
             write_gene_sequences_from_annotations(genes_to_write, fasta, disable_bar=disable_bar)
         else:
             # this should never happen if the pangenome has been properly checked before launching this function.
-            raise Exception("The pangenome does not include gene sequences")
+            raise AttributeError("The pangenome does not include gene sequences")
     logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'")
 
 
@@ -199,14 +211,14 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot:
     :param code: Genetic code use to translate nucleotide sequences to protein sequences
     :param disable_bar: Disable progress bar
     """
-    tmpdir = tmp/"translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes")
+    tmpdir = tmp / "translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes")
     mk_outdir(tmpdir, True, True)
 
     write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar)
 
-    with open(tmpdir/f"{genes_prot}_genes.fna") as sequences:
+    with open(tmpdir / f"{genes_prot}_genes.fna") as sequences:
         translate_db = translate_genes(sequences, tmpdir, cpu, code)
-    outpath = output/f"{genes_prot}_protein_genes.fna"
+    outpath = output / f"{genes_prot}_protein_genes.fna"
     cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
     subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
@@ -396,6 +408,9 @@ def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]:
     :param organism: organism object
 
     :return: Dictionary with all sequences associated to contig
+
+    :raises TypeError: If the file containing sequences is not recognized
+    :raises KeyError: If their inconsistency between pangenome contigs and the given contigs
     """
     filetype = detect_filetype(genome_file)
     if filetype in ["fasta", "gff"]:
@@ -403,12 +418,12 @@ def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]:
     elif filetype == "gbff":
         contig_to_sequence = read_fasta_gbk(genome_file)
     else:
-        raise Exception(f"Unknown filetype detected: '{genome_file}'")
+        raise TypeError(f"Unknown filetype detected: '{genome_file}'")
 
     # check_contig_names
     if set(contig_to_sequence) != {contig.name for contig in organism.contigs}:
-        raise Exception(f"Contig name inconsistency detected in genome '{organism.name}' between the "
-                        f"information stored in the pangenome file and the contigs found in '{genome_file}'.")
+        raise KeyError(f"Contig name inconsistency detected in genome '{organism.name}' between the "
+                       f"information stored in the pangenome file and the contigs found in '{genome_file}'.")
 
     return contig_to_sequence
 
@@ -420,7 +435,7 @@ def write_spaced_fasta(sequence: str, space: int = 60) -> str:
     :param sequence: sequence to write
     :param space: maximum of size for one line
 
-    :return: a sequence of maximum space caracter
+    :return: a sequence of maximum space character
     """
     seq = ""
     j = 0
@@ -442,6 +457,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa
     :param anno: A tab-separated file listing the organism names, and the gff/gbff filepath of its annotations
     :param compress: Compress the file in .gz
     :param disable_bar: Disable progress bar
+
+    :raises SyntaxError: if no tabulation are found in list genomes file
     """
     assert fasta is not None or anno is not None, "Write regions requires to use anno or fasta, not any provided"
 
@@ -450,7 +467,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa
     for line in read_compressed_or_not(organisms_file):
         elements = [el.strip() for el in line.split("\t")]
         if len(elements) <= 1:
-            raise Exception(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'")
+            raise SyntaxError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'")
         org_dict[elements[0]] = Path(elements[1])
         if not org_dict[elements[0]].exists():  # Check tsv sanity test if it's not one it's the other
             org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]])
@@ -527,7 +544,8 @@ def launch(args: argparse.Namespace):
     pangenome = Pangenome()
     pangenome.add_file(args.pangenome)
     write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core,
-                         regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, gene_families=args.gene_families,
+                         regions=args.regions, genes=args.genes, genes_prot=args.genes_prot,
+                         gene_families=args.gene_families,
                          prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar)
 
 
@@ -543,6 +561,7 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser
     parser_seq(parser)
     return parser
 
+
 def filter_values(arg_value: str):
     """
     Check filter value to ensure they are in the expected format.

From 788cd5cfc471c844dbf3c5452dcc43472acb3076 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Thu, 28 Mar 2024 17:42:33 +0100
Subject: [PATCH 04/25] Add translate arguments as kwargs

---
 ppanggolin/formats/writeSequences.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index c68eac5e..379601a1 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -500,7 +500,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa
 def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None,
                          soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None,
                          gene_families: str = None, prot_families: str = None, compress: bool = False,
-                         disable_bar: bool = False):
+                         disable_bar: bool = False, **translate_kwgs):
     """
     Main function to write sequence file from pangenome
 
@@ -528,7 +528,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
         write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar)
     if genes_prot is not None:
         write_gene_protein_sequences(pangenome, output, genes_prot, soft_core, compress,
-                                     disable_bar=disable_bar)
+                                     disable_bar=disable_bar, **translate_kwgs)
     if regions is not None:
         write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar)
 
@@ -540,13 +540,17 @@ def launch(args: argparse.Namespace):
     :param args: All arguments provide by user
     """
     check_write_sequences_args(args)
+    translate_kwgs = {"code": args.translation_table,
+                      "cpu": args.cpu,
+                      "tmp": args.tmpdir,
+                      "keep_tmp": args.keep_tmp}
     mk_outdir(args.output, args.force)
     pangenome = Pangenome()
     pangenome.add_file(args.pangenome)
     write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core,
                          regions=args.regions, genes=args.genes, genes_prot=args.genes_prot,
-                         gene_families=args.gene_families,
-                         prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar)
+                         gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress,
+                         disable_bar=args.disable_prog_bar, **translate_kwgs)
 
 
 def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
@@ -627,7 +631,7 @@ def parser_seq(parser: argparse.ArgumentParser):
     optional.add_argument("--translation_table", required=False, default="11",
                           help="Translation table (genetic code) to use.")
     optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
-    optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()),
+    optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()),
                           help="directory for storing temporary files")
     optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
                           help="Keeping temporary files (useful for debugging).")

From 9cb7b3155dbce1407288e232854a3904ee70de78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Thu, 28 Mar 2024 17:56:18 +0100
Subject: [PATCH 05/25] Replace cpu by threads to be more accurate

---
 ppanggolin/formats/writeSequences.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 379601a1..9e03f87b 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -32,7 +32,7 @@ def check_write_sequences_args(args: argparse.Namespace) -> None:
 
     :param args: argparse namespace arguments from CLI
 
-    :raises argparse.ArgumentTypeError: if region is given but neither fasta or anno is given
+    :raises argparse.ArgumentTypeError: if region is given but neither fasta nor anno is given
     """
     if args.regions is not None and args.fasta is None and args.anno is None:
         raise argparse.ArgumentError(argument=None,
@@ -135,12 +135,12 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o
     file_obj.flush()
 
 
-def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path:
+def translate_genes(sequences: TextIO, tmpdir: Path, threads: int = 1, code: int = 11) -> Path:
     """Translate nucleotide sequences into MMSeqs2 amino acid sequences database
 
     :param sequences: File with the nucleotide sequences
     :param tmpdir: Temporary directory to save the MMSeqs2 files
-    :param cpu: Number of available CPU cores to use
+    :param threads: Number of available threads to use
     :param code: Translation code to use
 
     :return: Path to the MMSeqs2 database
@@ -152,7 +152,7 @@ def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 1
     subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
     logging.getLogger("PPanGGOLiN").debug("Translate sequence ...")
     seqdb = tmpdir / 'aa_db'
-    cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code]))
+    cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code]))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
     subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
     return seqdb
@@ -197,7 +197,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
 
 def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: str, soft_core: float = 0.95,
                                  compress: bool = False, keep_tmp: bool = False, tmp: Path = None,
-                                 cpu: int = 1, code: int = 11, disable_bar: bool = False):
+                                 threads: int = 1, code: int = 11, disable_bar: bool = False):
     """ Write all amino acid sequences from given genes in pangenome
 
     :param pangenome: Pangenome object with gene families sequences
@@ -207,7 +207,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot:
     :param compress: Compress the file in .gz
     :param keep_tmp: Keep temporary directory
     :param tmp: Path to temporary directory
-    :param cpu: Number of CPU available
+    :param threads: Number of threads available
     :param code: Genetic code use to translate nucleotide sequences to protein sequences
     :param disable_bar: Disable progress bar
     """
@@ -217,7 +217,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot:
     write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar)
 
     with open(tmpdir / f"{genes_prot}_genes.fna") as sequences:
-        translate_db = translate_genes(sequences, tmpdir, cpu, code)
+        translate_db = translate_genes(sequences, tmpdir, threads, code)
     outpath = output / f"{genes_prot}_protein_genes.fna"
     cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
@@ -541,7 +541,7 @@ def launch(args: argparse.Namespace):
     """
     check_write_sequences_args(args)
     translate_kwgs = {"code": args.translation_table,
-                      "cpu": args.cpu,
+                      "threads": args.threads,
                       "tmp": args.tmpdir,
                       "keep_tmp": args.keep_tmp}
     mk_outdir(args.output, args.force)
@@ -630,7 +630,7 @@ def parser_seq(parser: argparse.ArgumentParser):
     optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz")
     optional.add_argument("--translation_table", required=False, default="11",
                           help="Translation table (genetic code) to use.")
-    optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
+    optional.add_argument("--threads", required=False, default=1, type=int, help="Number of available threads")
     optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()),
                           help="directory for storing temporary files")
     optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",

From 12186a5ba59a8239491d7f817e1a2d99fb295b3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Thu, 28 Mar 2024 17:56:30 +0100
Subject: [PATCH 06/25] Write test and documentation for the new option

---
 .github/workflows/main.yml |  1 +
 docs/user/writeFasta.md    | 28 +++++++++++++++++++++++-----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 67c4ad2d..5db304dd 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -77,6 +77,7 @@ jobs:
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot -c 1 --keep_tmp 
 
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log
diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md
index ac1ece26..e0ba5013 100644
--- a/docs/user/writeFasta.md
+++ b/docs/user/writeFasta.md
@@ -18,7 +18,9 @@ When using the `softcore` filter, the `--soft_core` option can be used to modify
 
 ## Genes
 
-This option can be used to write the nucleotide CDS sequences. It can be used as such, to write all of the genes of the pangenome for example:
+### Nucleotide sequences
+
+This option can be used to write the nucleotide CDS sequences. It can be used as such, to write all the genes of the pangenome for example:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes all
@@ -30,8 +32,25 @@ Or to write only the persistent genes:
 ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent
 ```
 
+### Protein sequences
+
+This option can be used to write the amino acid CDS sequences. It can be used as such, to write all the genes of the pangenome for example:
+
+```bash
+ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot all
+```
+
+Or to write only the cloud genes:
+
+```bash
+ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot cloud
+```
+
+To translate the genes sequences, PPanGGOLiN use [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. So for this option you can give multiple threads with `--threads`. You can also specify the translation table to use with `--translate_table`. Finally, you can keep the [MMSeqs2](https://github.com/soedinglab/MMseqs2) that are generated in the temporary directory (that you can also be specified with `--tmpdir`) by indicate the option `--keep_tmp`.
 
-## Protein families
+## Gene families
+
+### Protein sequences
 
 This option can be used to write the protein sequences of the representative sequences for each family. It can be used as such for all families:
 
@@ -39,14 +58,13 @@ This option can be used to write the protein sequences of the representative seq
 ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families all
 ```
 
-or for all of the shell families for example:
+or for all the shell families for example:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell
 ```
 
-
-## Gene families
+### Nucleotide sequences
 
 This option can be used to write the gene sequences of the representative sequences for each family. It can be used as such:
 

From 7febe1bb20c77fe1eab8531e1b1d291f89cf54bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 29 Mar 2024 15:04:59 +0100
Subject: [PATCH 07/25] Fix call new option for GitHub action

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 5db304dd..26c79513 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -77,7 +77,7 @@ jobs:
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot -c 1 --keep_tmp 
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud -c 1 --keep_tmp 
 
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log

From ee283cb0a7846f517fc31eaeebe0be1374f0a398 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 29 Mar 2024 15:55:52 +0100
Subject: [PATCH 08/25] Fix call new option for GitHub action

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 26c79513..b0ccea1b 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -77,7 +77,7 @@ jobs:
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud -c 1 --keep_tmp 
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud --threads 1 --keep_tmp 
 
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log

From 922073d725b9c9d39ec54151d83cd39976fd4f7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Wed, 10 Apr 2024 11:01:03 +0200
Subject: [PATCH 09/25] Refactoring

---
 ppanggolin/annotate/synta.py   |  4 ++--
 ppanggolin/formats/writeMSA.py | 21 +++++++++++----------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py
index d1eaa428..e4cdaf0b 100644
--- a/ppanggolin/annotate/synta.py
+++ b/ppanggolin/annotate/synta.py
@@ -301,9 +301,9 @@ def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str:
     :return: str
     """
     if gene.strand == "+":
-        return contig_seq[gene.start - 1:gene.stop]
+        return contig_seq[gene.start - 1: gene.stop]
     elif gene.strand == "-":
-        return reverse_complement(contig_seq[gene.start - 1:gene.stop])
+        return reverse_complement(contig_seq[gene.start - 1: gene.stop])
 
 
 def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str], tmpdir: str,
diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py
index 0e419205..00f974f2 100644
--- a/ppanggolin/formats/writeMSA.py
+++ b/ppanggolin/formats/writeMSA.py
@@ -15,6 +15,7 @@
 from tqdm import tqdm
 
 # local libraries
+from ppanggolin.genome import Gene
 from ppanggolin.geneFamily import GeneFamily
 from ppanggolin.pangenome import Pangenome
 from ppanggolin.utils import mk_outdir, restricted_float
@@ -95,10 +96,10 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core",
         return families
 
 
-def translate(seq: str, code: Dict[str, Dict[str, str]]) -> str:
+def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> str:
     """translates the given dna sequence with the given translation table
 
-    :param seq: given dna sequence
+    :param gene: given gene
     :param code: translation table corresponding to genetic code to use
 
     :return: protein sequence
@@ -106,17 +107,17 @@ def translate(seq: str, code: Dict[str, Dict[str, str]]) -> str:
     # code:  https://www.bioinformatics.org/sms/iupac.html
     start_table = code["start_table"]
     table = code["trans_table"]
-
-    if len(seq) % 3 == 0:
-        protein = start_table[seq[0: 3]]
-        for i in range(3, len(seq), 3):
-            codon = seq[i: i + 3]
+    if len(gene.dna) % 3 == 0:
+        protein = start_table[gene.dna[0: 3]]
+        for i in range(3, len(gene.dna), 3):
+            codon = gene.dna[i: i + 3]
             try:
                 protein += table[codon]
             except KeyError:  # codon was not planned for. Probably can't determine it.
                 protein += 'X'  # X is for unknown
     else:
-        raise IndexError("Given sequence length modulo 3 was different than 0, which is unexpected.")
+        logging.getLogger("PPANGGOLIN").error(f"Found gene length is {len(gene.dna)}")
+        raise IndexError(f"Gene {gene.ID} sequence length modulo 3 was different than 0, which is unexpected.")
     return protein
 
 
@@ -150,7 +151,7 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory
         if source == "dna":
             f_obj.write(gene.dna + '\n')
         elif source == "protein":
-            f_obj.write(translate(gene.dna, code_table) + "\n")
+            f_obj.write(translate(gene, code_table) + "\n")
         else:
             raise AssertionError("Unknown sequence source given (expected 'dna' or 'protein')")
     f_obj.flush()
@@ -229,7 +230,7 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: Path
     """
 
     # sort familes by ID, so the gene order is consistent
-    families = sorted(families, key = lambda fam: fam.ID)
+    families = sorted(families, key=lambda f: f.ID)
 
     phylo_dict = {}
     for org in pangenome.organisms:

From 9aaac235a22b05a3fd1fa89ccaa2ebbd4870b0ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Wed, 10 Apr 2024 14:34:35 +0200
Subject: [PATCH 10/25] Make gene translation more flexible by cutting off the
 end of genes that are not modulo 3

---
 ppanggolin/formats/writeMSA.py | 49 +++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py
index 00f974f2..e1382316 100644
--- a/ppanggolin/formats/writeMSA.py
+++ b/ppanggolin/formats/writeMSA.py
@@ -96,7 +96,7 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core",
         return families
 
 
-def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> str:
+def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> Tuple[str, bool]:
     """translates the given dna sequence with the given translation table
 
     :param gene: given gene
@@ -107,22 +107,25 @@ def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> str:
     # code:  https://www.bioinformatics.org/sms/iupac.html
     start_table = code["start_table"]
     table = code["trans_table"]
-    if len(gene.dna) % 3 == 0:
-        protein = start_table[gene.dna[0: 3]]
-        for i in range(3, len(gene.dna), 3):
-            codon = gene.dna[i: i + 3]
-            try:
-                protein += table[codon]
-            except KeyError:  # codon was not planned for. Probably can't determine it.
-                protein += 'X'  # X is for unknown
-    else:
-        logging.getLogger("PPANGGOLIN").error(f"Found gene length is {len(gene.dna)}")
-        raise IndexError(f"Gene {gene.ID} sequence length modulo 3 was different than 0, which is unexpected.")
-    return protein
+    mod = len(gene.dna) % 3
+    partial = False
+    if mod != 0:
+        partial = True
+        msg = (f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}"
+               f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.")
+        logging.getLogger("PPANGGOLIN").debug(msg)
+    protein = start_table[gene.dna[0: 3]]
+    for i in range(3, len(gene.dna) - mod, 3):
+        codon = gene.dna[i: i + 3]
+        try:
+            protein += table[codon]
+        except KeyError:  # codon was not planned for. Probably can't determine it.
+            protein += 'X'  # X is for unknown
+    return protein, partial
 
 
 def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory, code_table: Dict[str, Dict[str, str]],
-                         source: str = 'protein', use_gene_id: bool = False) -> Path:
+                         source: str = 'protein', use_gene_id: bool = False) -> Tuple[Path, bool]:
     """Write fasta files for each gene family
 
     :param family: gene family to write
@@ -143,6 +146,7 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory
         if len(genes) == 1:
             single_copy_genes.extend(genes)
 
+    partial = False
     for gene in single_copy_genes:
         if use_gene_id:
             f_obj.write('>' + gene.ID + "\n")
@@ -151,12 +155,14 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory
         if source == "dna":
             f_obj.write(gene.dna + '\n')
         elif source == "protein":
-            f_obj.write(translate(gene, code_table) + "\n")
+            protein, part = translate(gene, code_table)
+            if not partial:
+                partial = part
+            f_obj.write(protein + "\n")
         else:
             raise AssertionError("Unknown sequence source given (expected 'dna' or 'protein')")
     f_obj.flush()
-
-    return f_name
+    return f_name, partial
 
 
 def launch_mafft(fname: Path, output: Path, fam_name: str):
@@ -204,12 +210,19 @@ def compute_msa(families: Set[GeneFamily], output: Path, tmpdir: Path, cpu: int
     logging.getLogger("PPanGGOLiN").info("Preparing input files for MSA...")
     code_table = genetic_codes(code)
 
+    partial = False
     for family in tqdm(families, unit="family", disable=disable_bar):
         start_write = time.time()
-        fname = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id)
+        fname, part = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id)
+        if not partial:
+            partial = part
         write_total = write_total + (time.time() - start_write)
         args.append((fname, output, family.name))
 
+    if partial:
+        logging.getLogger("PPanGGOLiN").warning("Partial gene was found during translation. "
+                                                "Last nucleotides were removed to translate. "
+                                                "Use --verbose 2 to see genes that are partial")
     logging.getLogger("PPanGGOLiN").info("Computing the MSA ...")
     with get_context('fork').Pool(cpu) as p:
         with tqdm(total=len(families), unit="family", disable=disable_bar) as bar:

From 8129bfab155e01e3bcf6e380b84a134f598ef13f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Mon, 27 May 2024 11:07:28 +0200
Subject: [PATCH 11/25] Fix unset variable

---
 ppanggolin/formats/writeMSA.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py
index d59cac81..b4f35f01 100644
--- a/ppanggolin/formats/writeMSA.py
+++ b/ppanggolin/formats/writeMSA.py
@@ -111,8 +111,9 @@ def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> Tuple[str, bool]:
     partial = False
     if mod != 0:
         partial = True
-        msg = (f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}"
-               f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.")
+        msg = (
+            f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}"
+            f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.")
         logging.getLogger("PPANGGOLIN").debug(msg)
     protein = start_table[gene.dna[0: 3]]
     for i in range(3, len(gene.dna) - mod, 3):
@@ -145,9 +146,9 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory
     for _, genes in family.get_org_dict().items():
         if len(genes) == 1:
             single_copy_genes.extend(genes)
-            
-    with open(f_name, "w") as f_obj:
 
+    with open(f_name, "w") as f_obj:
+        partial = False
         for gene in single_copy_genes:
             if use_gene_id:
                 f_obj.write(f">{gene.ID}\n")
@@ -156,10 +157,10 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory
             if source == "dna":
                 f_obj.write(gene.dna + '\n')
             elif source == "protein":
-              protein, part = translate(gene, code_table)
-              if not partial:
-                  partial = part
-              f_obj.write(protein + "\n")
+                protein, part = translate(gene, code_table)
+                if not partial:
+                    partial = part
+                f_obj.write(protein + "\n")
             else:
                 raise ValueError(f"Unknown sequence source '{source}' provided. Expected 'dna' or 'protein'.")
 
@@ -179,12 +180,12 @@ def launch_mafft(fname: Path, output: Path, fam_name: str):
     logging.getLogger("PPanGGOLiN").debug("command: " + " ".join(cmd))
     result = subprocess.run(cmd, capture_output=True)
 
-    with  open(outname, "w") as fout:
+    with open(outname, "w") as fout:
         fout.write(result.stdout.decode('utf-8'))
 
     if result.stderr and result.returncode != 0:
         raise Exception("mafft failed with the following error:\n"
-                         f"{result.stderr.decode('utf-8')}")
+                        f"{result.stderr.decode('utf-8')}")
 
 
 def launch_multi_mafft(args: List[Tuple[Path, Path, str]]):

From 02b201a6847b6a61ec3d982daceba07997d699f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Tue, 4 Jun 2024 15:36:40 +0200
Subject: [PATCH 12/25] Replace translate_with_mmseqs by translate gene

---
 ppanggolin/align/alignOnPang.py      | 104 +++++++++------------------
 ppanggolin/cluster/cluster.py        |   2 +-
 ppanggolin/formats/writeSequences.py |  36 +++++++---
 3 files changed, 60 insertions(+), 82 deletions(-)

diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
index 247545ae..c4c7bc56 100644
--- a/ppanggolin/align/alignOnPang.py
+++ b/ppanggolin/align/alignOnPang.py
@@ -9,7 +9,7 @@
 import subprocess
 import argparse
 from collections import defaultdict, Counter
-from typing import List, Tuple, Set, Dict, IO, Iterable
+from typing import List, Tuple, Set, Dict, TextIO, Iterable
 from pathlib import Path
 
 from tqdm import tqdm
@@ -22,52 +22,10 @@
 from ppanggolin.region import Spot
 from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph
 from ppanggolin.formats.readBinaries import get_non_redundant_gene_sequences_from_file
+from ppanggolin.formats.writeSequences import translate_genes, create_mmseqs_db
 
 
-def create_mmseqs_db(seq_files: Iterable[Path], tmpdir: Path, basename="sequences") -> Path:
-    """
-    Create a MMseqs2 sequence database with the given fasta files.
-
-    :param seq_files: An iterable of path of FASTA files.
-    :param tmpdir: Path to the temporary directory where the database will be created.
-    :param basename: Prefix for the database file (default: "sequences").
-
-    :return: Path to the created MMseqs2 database file.
-    """
-
-    with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, suffix=".DB", prefix=basename) as seqdb:
-        cmd = ["mmseqs", "createdb"] + [seq_file.as_posix() for seq_file in seq_files] + [seqdb.name, '--dbtype', '0']
-
-        logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
-        subprocess.run(cmd, stdout=subprocess.DEVNULL)
-
-    return Path(seqdb.name)
-
-
-def translate_with_mmseqs(seqdb: Path, translation_table: int, cpu: int, tmpdir: Path) -> Path:
-    """
-    Translate nucleotide sequences in an MMseqs2 sequence database to amino acid sequences.
-
-    :param seqdb: Path to the input MMseqs2 sequence database containing nucleotide sequences.
-    :param translation_table: The translation table to use for conversion.
-    :param cpu: Number of CPU cores to use for translation.
-    :param tmpdir: Path to the temporary directory for intermediate files.
-
-    :return: Path to the new MMseqs2 sequence database containing translated amino acid sequences.
-    """
-
-    with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix=seqdb.stem,
-                                     suffix=".aa.DB") as seqdb_aa:
-        cmd = ["mmseqs", "translatenucs", seqdb.as_posix(), seqdb_aa.name, "--translation-table",
-               f"{translation_table}", "--threads", str(cpu)]
-
-        logging.getLogger().debug(" ".join(cmd))
-        subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
-
-    return Path(seqdb_aa.name)
-
-
-def align_seq_to_pang(target_seq_file: Path, query_seq_files: Iterable[Path],
+def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path,
                       tmpdir: Path, cpu: int = 1, no_defrag: bool = False,
                       identity: float = 0.8, coverage: float = 0.8,
                       is_query_nt: bool = False, is_target_nt: bool = False, translation_table: int = None) -> Path:
@@ -85,21 +43,23 @@ def align_seq_to_pang(target_seq_file: Path, query_seq_files: Iterable[Path],
     :param is_target_nt: Is the sequences of pangenome (target) are nucleotide sequences. If True, sequences are translated by mmseqs
     :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated.
 
-    :return: Alignement result file
+    :return: Alignment result file
     """
 
-    target_db = create_mmseqs_db([target_seq_file], tmpdir, basename="target_sequences")
-    query_db = create_mmseqs_db(query_seq_files, tmpdir, basename="query_sequences")
-
     if is_target_nt:
-        logging.getLogger().debug(
-            f"Target sequences will be translated by mmseqs with translation table {translation_table}")
-        target_db = translate_with_mmseqs(target_db, translation_table, cpu, tmpdir)
+        logging.getLogger().debug("Target sequences will be translated by mmseqs with "
+                                  f"translation table {translation_table}")
+        target_db = translate_genes(target_seq_file, 'target_db', tmpdir, cpu, translation_table)
+    else:
+        target_db = create_mmseqs_db(target_seq_file, 'target_db', tmpdir, db_mode=0, db_type=1)
 
-    if is_query_nt:
-        logging.getLogger().debug(
-            f"Query sequences will be translated by mmseqs with translation table {translation_table}")
-        query_db = translate_with_mmseqs(query_db, translation_table, cpu, tmpdir)
+    with open(query_seq_files, 'r') as query_sequences:
+        if is_query_nt:
+            logging.getLogger().debug("Query sequences will be translated by mmseqs "
+                                      f"with translation table {translation_table}")
+            query_db = translate_genes(query_sequences, 'query_db', tmpdir, cpu, translation_table)
+        else:
+            query_db = create_mmseqs_db(query_sequences, 'query_db', tmpdir, db_mode=0, db_type=1)
 
     cov_mode = "2"  # coverage of query
     if no_defrag:
@@ -141,11 +101,11 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path,
     Read alignment result to link input sequences to pangenome gene family. 
     Alignment have been made against all genes of the pangenome.
 
-    :param aln_res: Alignement result file
+    :param aln_res: Alignment result file
     :param outdir: Output directory
     :param pangenome: Input pangenome
 
-    :return: Dictionnary with sequence link to pangenome gene families and actual path to the cleaned alignment file
+    :return: Dictionary with sequence link to pangenome gene families and actual path to the cleaned alignment file
     """
 
     seq2pang = {}
@@ -176,11 +136,11 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path,
     Read alignment result to link input sequences to pangenome gene family.
     Alignment have been made against representative sequence of gene families of the pangenome.
 
-    :param aln_res: Alignement result file
+    :param aln_res: Alignment result file
     :param outdir: Output directory
     :param pangenome: Input pangenome
 
-    :return: Dictionnary with sequence link to pangenome gene families and actual path to the cleaned alignment file
+    :return: Dictionary with sequence link to pangenome gene families and actual path to the cleaned alignment file
     """
     seq2pang = {}
     aln_file_clean = outdir / "alignment_input_seqs_to_pangenome_gene_families.tsv"  # write the actual result file
@@ -231,7 +191,7 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]:
     return seq_set, is_nucleotide
 
 
-def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False):
+def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIO, add: str = "", disable_bar: bool = False):
     """
     Export the sequence of gene families
 
@@ -247,7 +207,7 @@ def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "",
     # file_obj.flush()
 
 
-def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False):
+def write_all_gene_sequences(pangenome: Pangenome, file_obj: TextIO, add: str = "", disable_bar: bool = False):
     """
     Export the sequence of pangenome genes
 
@@ -269,7 +229,7 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq
     """
     Project the partition of each sequence from the input file and write them in a file
 
-    :param seqid_to_gene_family: dictionnary which link sequence and pangenome gene family
+    :param seqid_to_gene_family: dictionary which link sequence and pangenome gene family
     :param seq_set: input sequences
     :param output: Path of the output directory
 
@@ -289,7 +249,7 @@ def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_s
     """
     Write input gene to pangenome gene family.
 
-    :param seqid_to_gene_family: dictionnary which links input sequence and pangenome gene family
+    :param seqid_to_gene_family: dictionary which links input sequence and pangenome gene family
     :param seq_set: input sequences
     :param output: Path of the output directory
 
@@ -317,7 +277,7 @@ def get_fam_to_rgp(pangenome, multigenics: set) -> dict:
     :param pangenome: Input pangenome
     :param multigenics: multigenics families
 
-    :return: Dictionnary link families to RGP
+    :return: Dictionary link families to RGP
     """
     fam2rgp = defaultdict(list)
     for rgp in pangenome.regions:
@@ -376,7 +336,7 @@ def draw_spot_gexf(spots: set, output: Path, multigenics: set, fam_to_mod: dict,
     :param spots: spot find in the alignment between pangenome and input sequences
     :param output: Path of the output directory
     :param multigenics: multigenics families
-    :param fam_to_mod: dictionnary which link families and modules
+    :param fam_to_mod: dictionary which link families and modules
     :param set_size:
     """
     for spot in spots:
@@ -433,7 +393,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel
                                          f"{output / 'info_input_seq.tsv'}")
 
 
-def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path,
+def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path, output: Path,
                                      tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False,
                                      identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11,
                                      disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]:
@@ -465,7 +425,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Itera
         logging.getLogger().debug(f'Write gene family sequences in {tmp_pang_file.name}')
         write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar)
 
-        align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files,
+        align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files,
                                        tmpdir=tmpdir, cpu=cpu,
                                        no_defrag=no_defrag, identity=identity, coverage=coverage,
                                        is_query_nt=is_input_seq_nt, is_target_nt=False,
@@ -476,7 +436,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Itera
     return align_file, seq2pang
 
 
-def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path,
+def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, output: Path,
                                      tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False,
                                      identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11,
                                      disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]:
@@ -507,7 +467,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Itera
         logging.getLogger().debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}')
         write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar)
 
-        align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files,
+        align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files,
                                        tmpdir=tmpdir, cpu=cpu,
                                        no_defrag=no_defrag, identity=identity, coverage=coverage,
                                        is_query_nt=is_input_seq_nt, is_target_nt=True,
@@ -564,14 +524,14 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo
     with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
 
         if use_representatives:
-            align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir,
+            align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir,
                                                                     is_input_seq_nt=is_nucleotide,
                                                                     cpu=cpu, no_defrag=no_defrag, identity=identity,
                                                                     coverage=coverage,
                                                                     translation_table=translation_table,
                                                                     disable_bar=disable_bar)
         else:
-            align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=[sequence_file],
+            align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file,
                                                                     output=output, tmpdir=new_tmpdir,
                                                                     is_input_seq_nt=is_nucleotide,
                                                                     cpu=cpu, no_defrag=no_defrag, identity=identity,
diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index 69d69f61..1f192370 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -85,7 +85,7 @@ def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int =
 
     :return: path to representative sequence file and path to tsv clustering result
     """
-    seqdb = translate_genes(sequences, tmpdir, cpu, code)
+    seqdb = translate_genes(sequences, 'aa_db', tmpdir, cpu, code)
     logging.getLogger("PPanGGOLiN").info("Clustering sequences...")
     cludb = tmpdir / 'cluster_db'
     cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id",
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index b47b5581..92886427 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -134,7 +134,29 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o
     file_obj.flush()
 
 
-def translate_genes(sequences: TextIO, tmpdir: Path, threads: int = 1, code: int = 11) -> Path:
+def create_mmseqs_db(sequences: TextIO, db_name: str, tmpdir: Path, db_mode: int = 0, db_type: int = 0) -> Path:
+    """Create a MMseqs2 database from a sequences file.
+
+    :param sequences: File with the sequences
+    :param tmpdir: Temporary directory to save the MMSeqs2 files
+    :param db_mode: Createdb mode 0: copy data, 1: soft link data and write new index (works only with single line fasta/q)
+    :param db_type: Database type 0: auto, 1: amino acid 2: nucleotides
+
+    :return: Path to the MMSeqs2 database
+    """
+    assert db_mode in [0, 1], f"Createdb mode must be 0 or 1, given {db_mode}"
+    assert db_type in [0, 1, 2], f"dbtype must be 0, 1 or 2, given {db_type}"
+
+    seq_nucdb = tmpdir / db_name
+    cmd = list(map(str, ["mmseqs", "createdb", "--createdb-mode", db_mode,
+                         "--dbtype", db_type, sequences.name, seq_nucdb]))
+    logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
+    logging.getLogger("PPanGGOLiN").info("Creating sequence database...")
+    subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
+    return seq_nucdb
+
+
+def translate_genes(sequences: TextIO, db_name: str, tmpdir: Path, threads: int = 1, code: int = 11) -> Path:
     """Translate nucleotide sequences into MMSeqs2 amino acid sequences database
 
     :param sequences: File with the nucleotide sequences
@@ -144,13 +166,9 @@ def translate_genes(sequences: TextIO, tmpdir: Path, threads: int = 1, code: int
 
     :return: Path to the MMSeqs2 database
     """
-    seq_nucdb = tmpdir / 'nucleotide_sequences_db'
-    cmd = list(map(str, ["mmseqs", "createdb", "--createdb-mode", 1, sequences.name, seq_nucdb]))
-    logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
-    logging.getLogger("PPanGGOLiN").info("Creating sequence database...")
-    subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
+    seq_nucdb = create_mmseqs_db(sequences, 'nucleotide_sequences_db', tmpdir)
     logging.getLogger("PPanGGOLiN").debug("Translate sequence ...")
-    seqdb = tmpdir / 'aa_db'
+    seqdb = tmpdir / db_name
     cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code]))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
     subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
@@ -222,7 +240,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot:
     write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar)
 
     with open(tmpdir / f"{genes_prot}_genes.fna") as sequences:
-        translate_db = translate_genes(sequences, tmpdir, threads, code)
+        translate_db = translate_genes(sequences, 'aa_db', tmpdir, threads, code)
     outpath = output / f"{genes_prot}_protein_genes.fna"
     cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
@@ -619,7 +637,7 @@ def parser_seq(parser: argparse.ArgumentParser):
                                        )
     onereq.add_argument("--genes", required=False, type=filter_values,
                         help=f"Write all nucleotide CDS sequences. {poss_values_log}")
-    onereq.add_argument("--genes_prot", required=False, type=filter_values,
+    onereq.add_argument("--proteins", required=False, type=filter_values,
                         help=f"Write representative amino acid sequences of genes. {poss_values_log}")
     onereq.add_argument("--prot_families", required=False, type=filter_values,
                         help=f"Write representative amino acid sequences of gene families. {poss_values_log}")

From 4cb53d158f8148447f4aa91f4f49d10a3cb3f559 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Tue, 4 Jun 2024 16:28:27 +0200
Subject: [PATCH 13/25] Add a soft link in mmseqs createdb

---
 ppanggolin/align/alignOnPang.py      | 103 +++++++++++++++------------
 ppanggolin/cluster/cluster.py        |   2 +-
 ppanggolin/formats/writeSequences.py |  13 ++--
 3 files changed, 69 insertions(+), 49 deletions(-)

diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
index c4c7bc56..a59739d8 100644
--- a/ppanggolin/align/alignOnPang.py
+++ b/ppanggolin/align/alignOnPang.py
@@ -9,7 +9,7 @@
 import subprocess
 import argparse
 from collections import defaultdict, Counter
-from typing import List, Tuple, Set, Dict, TextIO, Iterable
+from typing import List, Tuple, Set, Dict, TextIO
 from pathlib import Path
 
 from tqdm import tqdm
@@ -25,10 +25,10 @@
 from ppanggolin.formats.writeSequences import translate_genes, create_mmseqs_db
 
 
-def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path,
-                      tmpdir: Path, cpu: int = 1, no_defrag: bool = False,
-                      identity: float = 0.8, coverage: float = 0.8,
-                      is_query_nt: bool = False, is_target_nt: bool = False, translation_table: int = None) -> Path:
+def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path, tmpdir: Path, cpu: int = 1,
+                      no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
+                      is_query_nt: bool = False, is_query_slf: bool = False, is_target_nt: bool = False,
+                      is_target_slf: bool = False, translation_table: int = None) -> Path:
     """
     Align fasta sequence to pangenome sequences. 
 
@@ -40,26 +40,30 @@ def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path,
     :param identity: minimal identity threshold for the alignment
     :param coverage: minimal identity threshold for the alignment
     :param is_query_nt: Is the sequence file (query) are nucleotide sequences. If True, sequences are translated by mmseqs
+    :param is_query_slf: Is the sequence file (query) with single line fasta. If True, MMSeqs2 database will be with soft link
     :param is_target_nt: Is the sequences of pangenome (target) are nucleotide sequences. If True, sequences are translated by mmseqs
+    :param is_target_slf: Is the sequences of pangenome (target) with single line fasta. If True, MMSeqs2 database will be with soft link
     :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated.
 
     :return: Alignment result file
     """
 
     if is_target_nt:
-        logging.getLogger().debug("Target sequences will be translated by mmseqs with "
-                                  f"translation table {translation_table}")
-        target_db = translate_genes(target_seq_file, 'target_db', tmpdir, cpu, translation_table)
+        logging.getLogger("PPanGGOLiN").debug("Target sequences will be translated by mmseqs with "
+                                              f"translation table {translation_table}")
+        target_db = translate_genes(target_seq_file, 'target_db', tmpdir, cpu, is_target_slf, translation_table)
     else:
-        target_db = create_mmseqs_db(target_seq_file, 'target_db', tmpdir, db_mode=0, db_type=1)
+        target_db = create_mmseqs_db(target_seq_file, 'target_db', tmpdir,
+                                     db_mode=1 if is_target_slf else 0, db_type=1)
 
     with open(query_seq_files, 'r') as query_sequences:
         if is_query_nt:
-            logging.getLogger().debug("Query sequences will be translated by mmseqs "
-                                      f"with translation table {translation_table}")
-            query_db = translate_genes(query_sequences, 'query_db', tmpdir, cpu, translation_table)
+            logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs "
+                                                  f"with translation table {translation_table}")
+            query_db = translate_genes(query_sequences, 'query_db', tmpdir, cpu, is_query_slf, translation_table)
         else:
-            query_db = create_mmseqs_db(query_sequences, 'query_db', tmpdir, db_mode=0, db_type=1)
+            query_db = create_mmseqs_db(query_sequences, 'query_db', tmpdir,
+                                        db_mode=1 if is_query_slf else 0, db_type=1)
 
     cov_mode = "2"  # coverage of query
     if no_defrag:
@@ -75,21 +79,21 @@ def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path,
                "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu),
                "--seed-sub-mat", "VTML40.out", "-s", "2", '--comp-bias-corr', "0", "--mask", "0", "-e", "1"]
 
-        logging.getLogger().info("Aligning sequences")
-        logging.getLogger().debug(" ".join(cmd))
+        logging.getLogger("PPanGGOLiN").info("Aligning sequences")
+        logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
 
         start = time.time()
         subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
         align_time = time.time() - start
-        logging.getLogger().info(f"Done aligning sequences in {round(align_time, 2)} seconds")
+        logging.getLogger("PPanGGOLiN").info(f"Done aligning sequences in {round(align_time, 2)} seconds")
 
         with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix=".tsv",
                                          delete=False) as outfile:
             cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name,
                    "--format-mode", "2"]
 
-            logging.getLogger().info("Extracting alignments...")
-            logging.getLogger().debug(" ".join(cmd))
+            logging.getLogger("PPanGGOLiN").info("Extracting alignments...")
+            logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
             subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
 
     return Path(outfile.name)
@@ -110,7 +114,7 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path,
 
     seq2pang = {}
     aln_file_clean = outdir / "alignment_input_seqs_to_all_pangenome_genes.tsv"  # write the actual result file
-    logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}')
+    logging.getLogger("PPanGGOLiN").debug(f'Writing alignment file in {aln_file_clean}')
 
     with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl:
         for line in alnFile:
@@ -145,7 +149,7 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path,
     seq2pang = {}
     aln_file_clean = outdir / "alignment_input_seqs_to_pangenome_gene_families.tsv"  # write the actual result file
 
-    logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}')
+    logging.getLogger("PPanGGOLiN").debug(f'Writing alignment file in {aln_file_clean}')
 
     with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl:
         for line in alnFile:
@@ -165,7 +169,7 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path,
     return seq2pang, aln_file_clean
 
 
-def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]:
+def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool, bool]:
     """
     Get sequence IDs from a sequence input file in FASTA format and guess the sequence type based on the first sequences.
 
@@ -177,18 +181,23 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]:
     seq_set = set()
     seq_count = 0
     first_seq_concat = ""
-
+    single_line_fasta = False
+    count_fasta_line = 0
     for line in seq_file:
         if line.startswith(">"):
             seq_set.add(line[1:].split()[0].strip())
             seq_count += 1
+            if count_fasta_line > 1:  # Allow to know if we can use soft link with createdb from MMSeqs2
+                single_line_fasta = True
+            count_fasta_line = 0
         elif seq_count <= 20:
             first_seq_concat += line.strip()
+        count_fasta_line += 1
 
     char_counter = Counter(first_seq_concat)
     is_nucleotide = all(char in dna_expected_char for char in char_counter)
 
-    return seq_set, is_nucleotide
+    return seq_set, is_nucleotide, single_line_fasta
 
 
 def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIO, add: str = "", disable_bar: bool = False):
@@ -394,7 +403,8 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel
 
 
 def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path, output: Path,
-                                     tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False,
+                                     tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False,
+                                     cpu: int = 1, no_defrag: bool = False,
                                      identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11,
                                      disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]:
     """
@@ -407,7 +417,8 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path,
     :param sequence_files: Iterable of paths of FASTA files containing input sequences to align.
     :param output: Path to the output directory where alignment results will be stored.
     :param tmpdir: Temporary directory for intermediate files.
-    :param is_input_seq_nt: Is input sequence file nucleotide sequences. 
+    :param is_input_seq_nt: Is input sequence file nucleotide sequences.
+    :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link
     :param cpu: Number of CPU cores to use for the alignment (default: 1).
     :param no_defrag: If True, the defragmentation workflow is skipped (default: False).
     :param identity: Minimum identity threshold for the alignment (default: 0.8).
@@ -422,23 +433,24 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path,
     # delete False to be able to keep tmp file. If they are not keep tmpdir will be destroyed so no need to delete tmpfile
     with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False,
                                      prefix="representative_genes", suffix=".faa") as tmp_pang_file:
-        logging.getLogger().debug(f'Write gene family sequences in {tmp_pang_file.name}')
+        logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {tmp_pang_file.name}')
         write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar)
 
         align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files,
                                        tmpdir=tmpdir, cpu=cpu,
                                        no_defrag=no_defrag, identity=identity, coverage=coverage,
-                                       is_query_nt=is_input_seq_nt, is_target_nt=False,
-                                       translation_table=translation_table)
+                                       is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf,
+                                       is_target_nt=False, is_target_slf=False, translation_table=translation_table)
 
         seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome)
 
     return align_file, seq2pang
 
 
-def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, output: Path,
-                                     tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False,
-                                     identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11,
+def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, output: Path, tmpdir: Path,
+                                     is_input_seq_nt: bool, is_input_slf: bool = False,
+                                     cpu: int = 1, no_defrag: bool = False, identity: float = 0.8,
+                                     coverage: float = 0.8, translation_table: int = 11,
                                      disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]:
     """
     Assign gene families from a pangenome to input sequences.
@@ -450,7 +462,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path,
     :param sequence_files: Iterable of paths of FASTA files containing input sequences to align.
     :param output: Path to the output directory where alignment results will be stored.
     :param tmpdir: Temporary directory for intermediate files.
-    :param is_input_seq_nt: Is input sequence file nucleotide sequences. 
+    :param is_input_seq_nt: Is input sequence file nucleotide sequences.
+    :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link
     :param cpu: Number of CPU cores to use for the alignment (default: 1).
     :param no_defrag: If True, the defragmentation workflow is skipped (default: False).
     :param identity: Minimum identity threshold for the alignment (default: 0.8).
@@ -464,14 +477,13 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path,
 
     with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False,
                                      prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file:
-        logging.getLogger().debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}')
+        logging.getLogger("PPanGGOLiN").debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}')
         write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar)
 
-        align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files,
-                                       tmpdir=tmpdir, cpu=cpu,
-                                       no_defrag=no_defrag, identity=identity, coverage=coverage,
-                                       is_query_nt=is_input_seq_nt, is_target_nt=True,
-                                       translation_table=translation_table)
+        align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files, tmpdir=tmpdir,
+                                       cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
+                                       is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True,
+                                       is_target_slf=True, translation_table=translation_table)
 
         seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome)
 
@@ -480,7 +492,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path,
 
 def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8,
           coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False,
-          use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11, 
+          use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11,
           tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False):
     """
     Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2.
@@ -519,13 +531,14 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo
         check_pangenome_info(pangenome, need_families=True, disable_bar=disable_bar)
 
     with read_compressed_or_not(sequence_file) as seqFileObj:
-        seq_set, is_nucleotide = get_seq_ids(seqFileObj)
+        seq_set, is_nucleotide, single_line_fasta = get_seq_ids(seqFileObj)
 
     with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
 
         if use_representatives:
             align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir,
                                                                     is_input_seq_nt=is_nucleotide,
+                                                                    is_input_slf=single_line_fasta,
                                                                     cpu=cpu, no_defrag=no_defrag, identity=identity,
                                                                     coverage=coverage,
                                                                     translation_table=translation_table,
@@ -534,6 +547,7 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo
             align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file,
                                                                     output=output, tmpdir=new_tmpdir,
                                                                     is_input_seq_nt=is_nucleotide,
+                                                                    is_input_slf=single_line_fasta,
                                                                     cpu=cpu, no_defrag=no_defrag, identity=identity,
                                                                     coverage=coverage,
                                                                     translation_table=translation_table,
@@ -543,9 +557,10 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo
         get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar)
 
     part_proj = project_and_write_partition(seq2pang, seq_set, output)  # write the partition assignation only
-    logging.getLogger().info(f"sequences partition projection : '{part_proj}'")
-    logging.getLogger().info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.")
-    logging.getLogger().info(f"Blast-tab file of the alignment : '{align_file}'")
+    logging.getLogger("PPanGGOLiN").info(f"sequences partition projection : '{part_proj}'")
+    logging.getLogger("PPanGGOLiN").info(
+        f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.")
+    logging.getLogger("PPanGGOLiN").info(f"Blast-tab file of the alignment : '{align_file}'")
 
 
 def launch(args: argparse.Namespace):
@@ -617,7 +632,7 @@ def parser_align(parser: argparse.ArgumentParser):
                           help="In the context of provided annotation, use this option to read pseudogenes. "
                                "(Default behavior is to ignore them)")
     optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
-    optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()),
+    optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()),
                           help="directory for storing temporary files")
     optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
                           help="Keeping temporary files (useful for debugging).")
diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index 1f192370..85d99ebc 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -85,7 +85,7 @@ def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int =
 
     :return: path to representative sequence file and path to tsv clustering result
     """
-    seqdb = translate_genes(sequences, 'aa_db', tmpdir, cpu, code)
+    seqdb = translate_genes(sequences, 'aa_db', tmpdir, cpu, True, code)
     logging.getLogger("PPanGGOLiN").info("Clustering sequences...")
     cludb = tmpdir / 'cluster_db'
     cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id",
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 92886427..24d1ab76 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -138,6 +138,7 @@ def create_mmseqs_db(sequences: TextIO, db_name: str, tmpdir: Path, db_mode: int
     """Create a MMseqs2 database from a sequences file.
 
     :param sequences: File with the sequences
+    :param db_name: name of the database
     :param tmpdir: Temporary directory to save the MMSeqs2 files
     :param db_mode: Createdb mode 0: copy data, 1: soft link data and write new index (works only with single line fasta/q)
     :param db_type: Database type 0: auto, 1: amino acid 2: nucleotides
@@ -156,17 +157,21 @@ def create_mmseqs_db(sequences: TextIO, db_name: str, tmpdir: Path, db_mode: int
     return seq_nucdb
 
 
-def translate_genes(sequences: TextIO, db_name: str, tmpdir: Path, threads: int = 1, code: int = 11) -> Path:
+def translate_genes(sequences: TextIO, db_name: str, tmpdir: Path, threads: int = 1,
+                    is_single_line_fasta: bool = False, code: int = 11) -> Path:
     """Translate nucleotide sequences into MMSeqs2 amino acid sequences database
 
     :param sequences: File with the nucleotide sequences
+    :param db_name: name of the output database
     :param tmpdir: Temporary directory to save the MMSeqs2 files
     :param threads: Number of available threads to use
+    :param is_single_line_fasta: Allow to use soft link in MMSeqs2 database
     :param code: Translation code to use
 
     :return: Path to the MMSeqs2 database
     """
-    seq_nucdb = create_mmseqs_db(sequences, 'nucleotide_sequences_db', tmpdir)
+    seq_nucdb = create_mmseqs_db(sequences, 'nucleotide_sequences_db', tmpdir,
+                                 db_mode=1 if is_single_line_fasta else 0, db_type=2)
     logging.getLogger("PPanGGOLiN").debug("Translate sequence ...")
     seqdb = tmpdir / db_name
     cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code]))
@@ -196,7 +201,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
     genefams = set()
     genes_to_write = []
     if genes == "rgp":
-        logging.getLogger("PPanGGOLiN").info(f"Writing the gene nucleotide sequences in RGPs...")
+        logging.getLogger("PPanGGOLiN").info("Writing the gene nucleotide sequences in RGPs...")
         for region in pangenome.regions:
             genes_to_write.extend(region.genes)
     else:
@@ -240,7 +245,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot:
     write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar)
 
     with open(tmpdir / f"{genes_prot}_genes.fna") as sequences:
-        translate_db = translate_genes(sequences, 'aa_db', tmpdir, threads, code)
+        translate_db = translate_genes(sequences, 'aa_db', tmpdir, threads, True, code)
     outpath = output / f"{genes_prot}_protein_genes.fna"
     cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))

From 326403f3a4178960bb7be5a052bc4667069f82da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Tue, 4 Jun 2024 16:36:35 +0200
Subject: [PATCH 14/25] Replace add_spot_str function by Spot class method
 __str__

---
 ppanggolin/align/alignOnPang.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
index a59739d8..9eef70a6 100644
--- a/ppanggolin/align/alignOnPang.py
+++ b/ppanggolin/align/alignOnPang.py
@@ -326,18 +326,6 @@ def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) \
     return fam2spot, fam2border
 
 
-def add_spot_str(spot: Spot) -> str:
-    # TODO define as self.__str__ in spot
-    """
-    allow to map spot set
-
-    :param spot: spot which will be return
-
-    :return: Str with spot ID
-    """
-    return "spot_" + str(spot.ID)
-
-
 def draw_spot_gexf(spots: set, output: Path, multigenics: set, fam_to_mod: dict, set_size: int = 3):
     """
     Draw a gexf graph of the spot
@@ -374,8 +362,8 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel
     spot_list = set()
     for seq, panfam in seq_to_pang.items():
         finfo.write(seq + '\t' + panfam.name + "\t" + panfam.named_partition + "\t" + ",".join(
-            map(add_spot_str, fam2spot[panfam])) + "\t" + ",".join(
-            map(add_spot_str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n")
+            map(str, fam2spot[panfam])) + "\t" + ",".join(
+            map(str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n")
         spot_list |= set(fam2spot[panfam])
         spot_list |= set(fam2border[panfam])
     finfo.close()

From 7e09b19f38b19c8b58183435e76964f429efa934 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Wed, 5 Jun 2024 10:16:34 +0200
Subject: [PATCH 15/25] Fix bug in projection and refactoring

---
 .github/workflows/main.yml              |  4 +-
 docs/user/writeFasta.md                 |  2 +-
 ppanggolin/align/alignOnPang.py         | 53 +++++++++++++------------
 ppanggolin/cluster/cluster.py           |  7 ++--
 ppanggolin/context/searchGeneContext.py | 11 +++--
 ppanggolin/figures/draw_spot.py         | 21 +++++-----
 ppanggolin/formats/writeSequences.py    | 42 ++++++++++----------
 ppanggolin/projection/projection.py     |  8 ++--
 8 files changed, 76 insertions(+), 72 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index cb5b0f47..94e5f44f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -89,7 +89,7 @@ jobs:
         ppanggolin rarefaction --output stepbystep -f -p stepbystep/pangenome.h5 --depth 5 --min 1 --max 50 -ms 10 -fd -ck 30 -K 3 --soft_core 0.9 -se $RANDOM
         ppanggolin draw -p stepbystep/pangenome.h5 --tile_plot --nocloud --soft_core 0.92 --ucurve --output stepbystep -f
         ppanggolin rgp -p stepbystep/pangenome.h5 --persistent_penalty 2 --variable_gain 1 --min_score 3 --dup_margin 0.05
-        ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1
+        ppanggolin spot -p stepbystep/pangenome.h5 --output stepbystep --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1 -f
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
         ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
         ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06  --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --regions --borders --families_tsv --cpu 1 
@@ -100,7 +100,7 @@ jobs:
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud --threads 1 --keep_tmp 
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --threads 1 --keep_tmp 
 
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log
diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md
index e0ba5013..843f0788 100644
--- a/docs/user/writeFasta.md
+++ b/docs/user/writeFasta.md
@@ -37,7 +37,7 @@ ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent
 This option can be used to write the amino acid CDS sequences. It can be used as such, to write all the genes of the pangenome for example:
 
 ```bash
-ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot all
+ppanggolin fasta -p pangenome.h5 --output MY_GENES --proteins all
 ```
 
 Or to write only the cloud genes:
diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
index 9eef70a6..0742a876 100644
--- a/ppanggolin/align/alignOnPang.py
+++ b/ppanggolin/align/alignOnPang.py
@@ -9,7 +9,7 @@
 import subprocess
 import argparse
 from collections import defaultdict, Counter
-from typing import List, Tuple, Set, Dict, TextIO
+from typing import List, Tuple, Set, Dict, TextIO, Union, Iterable, Any
 from pathlib import Path
 
 from tqdm import tqdm
@@ -25,7 +25,7 @@
 from ppanggolin.formats.writeSequences import translate_genes, create_mmseqs_db
 
 
-def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path, tmpdir: Path, cpu: int = 1,
+def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]], tmpdir: Path, cpu: int = 1,
                       no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
                       is_query_nt: bool = False, is_query_slf: bool = False, is_target_nt: bool = False,
                       is_target_slf: bool = False, translation_table: int = None) -> Path:
@@ -53,17 +53,16 @@ def align_seq_to_pang(target_seq_file: TextIO, query_seq_files: Path, tmpdir: Pa
                                               f"translation table {translation_table}")
         target_db = translate_genes(target_seq_file, 'target_db', tmpdir, cpu, is_target_slf, translation_table)
     else:
-        target_db = create_mmseqs_db(target_seq_file, 'target_db', tmpdir,
-                                     db_mode=1 if is_target_slf else 0, db_type=1)
-
-    with open(query_seq_files, 'r') as query_sequences:
-        if is_query_nt:
-            logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs "
-                                                  f"with translation table {translation_table}")
-            query_db = translate_genes(query_sequences, 'query_db', tmpdir, cpu, is_query_slf, translation_table)
-        else:
-            query_db = create_mmseqs_db(query_sequences, 'query_db', tmpdir,
-                                        db_mode=1 if is_query_slf else 0, db_type=1)
+        target_db = create_mmseqs_db([target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file,
+                                     'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=1)
+
+    if is_query_nt:
+        logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs "
+                                              f"with translation table {translation_table}")
+        query_db = translate_genes(query_seq_files, 'query_db', tmpdir, cpu, is_query_slf, translation_table)
+    else:
+        query_db = create_mmseqs_db([query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files,
+                                    'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=1)
 
     cov_mode = "2"  # coverage of query
     if no_defrag:
@@ -135,7 +134,7 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path,
 
 
 def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path,
-                                     pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]:
+                                     pangenome: Pangenome) -> Tuple[Dict[Any, GeneFamily], Path]:
     """
     Read alignment result to link input sequences to pangenome gene family.
     Alignment have been made against representative sequence of gene families of the pangenome.
@@ -390,11 +389,11 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel
                                          f"{output / 'info_input_seq.tsv'}")
 
 
-def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path, output: Path,
-                                     tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False,
-                                     cpu: int = 1, no_defrag: bool = False,
-                                     identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11,
-                                     disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]:
+def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path,
+                                     tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1,
+                                     no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
+                                     translation_table: int = 11, disable_bar: bool = False
+                                     ) -> Tuple[Path, Dict[str, GeneFamily]]:
     """
     Assign gene families from a pangenome to input sequences.
 
@@ -422,9 +421,10 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path,
     with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False,
                                      prefix="representative_genes", suffix=".faa") as tmp_pang_file:
         logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {tmp_pang_file.name}')
+        pangenome_sequences = Path(tmp_pang_file.name)
         write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar)
 
-        align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files,
+        align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files,
                                        tmpdir=tmpdir, cpu=cpu,
                                        no_defrag=no_defrag, identity=identity, coverage=coverage,
                                        is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf,
@@ -435,11 +435,11 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Path,
     return align_file, seq2pang
 
 
-def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path, output: Path, tmpdir: Path,
-                                     is_input_seq_nt: bool, is_input_slf: bool = False,
-                                     cpu: int = 1, no_defrag: bool = False, identity: float = 0.8,
-                                     coverage: float = 0.8, translation_table: int = 11,
-                                     disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]:
+def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files:  Union[Path, Iterable[Path]], output: Path,
+                                     tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1,
+                                     no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
+                                     translation_table: int = 11, disable_bar: bool = False
+                                     ) -> Tuple[Path, Dict[str, GeneFamily]]:
     """
     Assign gene families from a pangenome to input sequences.
 
@@ -466,9 +466,10 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Path,
     with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False,
                                      prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file:
         logging.getLogger("PPanGGOLiN").debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}')
+        pangenome_sequences = Path(tmp_pang_file.name)
         write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar)
 
-        align_file = align_seq_to_pang(target_seq_file=tmp_pang_file, query_seq_files=sequence_files, tmpdir=tmpdir,
+        align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir,
                                        cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
                                        is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True,
                                        is_target_slf=True, translation_table=translation_table)
diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index 85d99ebc..639c578e 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -70,7 +70,7 @@ def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force
                         "or provide a way to access the gene sequence during the annotation step "
                         "(having the fasta in the gff files, or providing the fasta files through the --fasta option)")
 
-def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8,
+def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8,
                      identity: float = 0.8, mode: int = 1) -> Tuple[Path, Path]:
     """
     Make a first clustering of all sequences in pangenome
@@ -304,10 +304,11 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
         newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
         tmp_path = Path(newtmpdir.name)
 
-    with open(tmp_path/'nucleotid_sequences', "w") as sequence_file:
+    sequence_path = tmp_path/'nucleotid_sequences'
+    with open(sequence_path, "w") as sequence_file:
         check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar)
         logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...")
-        rep, tsv = first_clustering(sequence_file, tmp_path, cpu, code, coverage, identity, mode)
+        rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode)
 
     fam2seq = read_faa(rep)
     if not defrag:
diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py
index 5856997e..0312e4fa 100644
--- a/ppanggolin/context/searchGeneContext.py
+++ b/ppanggolin/context/searchGeneContext.py
@@ -69,24 +69,23 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir:
     if sequence_file is not None:
         # Alignment of sequences on pangenome families
         with read_compressed_or_not(sequence_file) as seqFileObj:
-            seq_set, is_nucleotide = get_seq_ids(seqFileObj)
+            seq_set, is_nucleotide, is_slf = get_seq_ids(seqFileObj)
 
         logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences")
 
         with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
 
             if use_representatives:
-                _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output,
-                                                                new_tmpdir, is_input_seq_nt=is_nucleotide,
+                _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir,
+                                                                is_input_seq_nt=is_nucleotide, is_input_slf=is_slf,
                                                                 cpu=cpu, no_defrag=no_defrag,
                                                                 identity=identity, coverage=coverage,
                                                                 translation_table=translation_table,
                                                                 disable_bar=disable_bar)
             else:
-                _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome,
-                                                                sequence_files=[sequence_file],
+                _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file,
                                                                 output=output, tmpdir=new_tmpdir,
-                                                                is_input_seq_nt=is_nucleotide,
+                                                                is_input_seq_nt=is_nucleotide, is_input_slf=is_slf,
                                                                 cpu=cpu, no_defrag=no_defrag,
                                                                 identity=identity, coverage=coverage,
                                                                 translation_table=translation_table,
diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py
index dc27baa1..bbd65bb5 100644
--- a/ppanggolin/figures/draw_spot.py
+++ b/ppanggolin/figures/draw_spot.py
@@ -156,7 +156,7 @@ def line_order_gene_lists(gene_lists: list, overlapping_match: int, exact_match:
         new_classify = set()
 
 
-def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = 3,
+def subgraph(spot: Spot, outname: Path, with_border: bool = True, set_size: int = 3,
              multigenics: set = None, fam_to_mod: dict = None):
     """
     Write a pangeome subgraph of the gene families of a spot in gexf format
@@ -212,7 +212,7 @@ def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int =
         if "name" in g.nodes[node]:
             g.nodes[node]["name"] = g.nodes[node]["name"].most_common(1)[0][0]
 
-    nx.write_gexf(g, outname)
+    nx.write_gexf(g, outname.absolute().as_posix())
 
 def is_gene_list_ordered(genes:List[Feature]):
     """
@@ -538,7 +538,7 @@ def add_genome_tools(fig, gene_recs: GlyphRenderer, genome_recs: GlyphRenderer,
     return column(genome_header, slider_spacing, slider_font, slider_offset)
 
 
-def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam_col: dict, file_name: str):
+def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam_col: dict, output: Path):
     """
 
     :param gene_lists:
@@ -550,7 +550,7 @@ def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam
     """
 
     # Prepare the source data
-    output_file(file_name + ".html")
+    output_file(output)
 
     # generate the figure and add some tools to it
     wheel_zoom = WheelZoomTool()
@@ -608,10 +608,10 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome:
             fam2mod[fam] = f"module_{mod.ID}"
 
     for spot in tqdm(selected_spots, total=len(selected_spots), unit="spot", disable=disable_bar):
-        fname = output / f"spot_{str(spot.ID)}"
-
+        basename = f"spot_{str(spot.ID)}"
+        identical_rgp_out = output / (basename + '_identical_rgps.tsv')
         # write rgps representatives and the rgps they are identical to
-        with open(fname.absolute().as_posix() + '_identical_rgps.tsv', 'w') as out_struc:
+        with open(identical_rgp_out, 'w') as out_struc:
             out_struc.write('representative_rgp\trepresentative_rgp_genome\tidentical_rgp\tidentical_rgp_genome\n')
             for key_rgp, other_rgps in spot.get_uniq_to_rgp().items():
                 for rgp in other_rgps:
@@ -667,9 +667,10 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome:
                 uniq_gene_lists.append(genelist)
                 ordered_counts.append(curr_genelist_count)
 
-        draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, fname.absolute().as_posix())
-        subgraph(spot, fname.absolute().as_posix() + ".gexf", set_size=set_size,
-                 multigenics=multigenics, fam_to_mod=fam2mod)
+        draw_spot_out = output / (basename + ".html")
+        subgraph_out = output / (basename + ".gexf")
+        draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, draw_spot_out)
+        subgraph(spot, subgraph_out, set_size=set_size, multigenics=multigenics, fam_to_mod=fam2mod)
     logging.getLogger("PPanGGOLiN").info(f"Done drawing spot(s), they can be found in the directory: '{output}'")
 
 
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 24d1ab76..f27d44ba 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -6,8 +6,9 @@
 import logging
 import re
 import subprocess
+import sys
 from pathlib import Path
-from typing import TextIO, Dict, Set, Iterable
+from typing import TextIO, Dict, Set, Iterable, Union
 import tempfile
 import shutil
 
@@ -134,7 +135,7 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o
     file_obj.flush()
 
 
-def create_mmseqs_db(sequences: TextIO, db_name: str, tmpdir: Path, db_mode: int = 0, db_type: int = 0) -> Path:
+def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_mode: int = 0, db_type: int = 0) -> Path:
     """Create a MMseqs2 database from a sequences file.
 
     :param sequences: File with the sequences
@@ -149,15 +150,16 @@ def create_mmseqs_db(sequences: TextIO, db_name: str, tmpdir: Path, db_mode: int
     assert db_type in [0, 1, 2], f"dbtype must be 0, 1 or 2, given {db_type}"
 
     seq_nucdb = tmpdir / db_name
-    cmd = list(map(str, ["mmseqs", "createdb", "--createdb-mode", db_mode,
-                         "--dbtype", db_type, sequences.name, seq_nucdb]))
+    cmd = ["mmseqs", "createdb", "--createdb-mode", db_mode, "--dbtype", db_type]
+    cmd += [seq.absolute().as_posix() for seq in sequences] + [seq_nucdb]
+    cmd = list(map(str, cmd))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
     logging.getLogger("PPanGGOLiN").info("Creating sequence database...")
     subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
     return seq_nucdb
 
 
-def translate_genes(sequences: TextIO, db_name: str, tmpdir: Path, threads: int = 1,
+def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir: Path, threads: int = 1,
                     is_single_line_fasta: bool = False, code: int = 11) -> Path:
     """Translate nucleotide sequences into MMSeqs2 amino acid sequences database
 
@@ -170,8 +172,8 @@ def translate_genes(sequences: TextIO, db_name: str, tmpdir: Path, threads: int
 
     :return: Path to the MMSeqs2 database
     """
-    seq_nucdb = create_mmseqs_db(sequences, 'nucleotide_sequences_db', tmpdir,
-                                 db_mode=1 if is_single_line_fasta else 0, db_type=2)
+    seq_nucdb = create_mmseqs_db([sequences] if isinstance(sequences, Path) else sequences, f'{db_name}_nt_db',
+                                 tmpdir, db_mode=1 if is_single_line_fasta else 0, db_type=2)
     logging.getLogger("PPanGGOLiN").debug("Translate sequence ...")
     seqdb = tmpdir / db_name
     cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code]))
@@ -223,14 +225,14 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
     logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'")
 
 
-def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: str, soft_core: float = 0.95,
+def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: str, soft_core: float = 0.95,
                                  compress: bool = False, keep_tmp: bool = False, tmp: Path = None,
                                  threads: int = 1, code: int = 11, disable_bar: bool = False):
     """ Write all amino acid sequences from given genes in pangenome
 
     :param pangenome: Pangenome object with gene families sequences
     :param output: Path to output directory
-    :param genes_prot: Selected partition of gene
+    :param proteins: Selected partition of gene
     :param soft_core: Soft core threshold to use
     :param compress: Compress the file in .gz
     :param keep_tmp: Keep temporary directory
@@ -242,11 +244,11 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot:
     tmpdir = tmp / "translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes")
     mk_outdir(tmpdir, True, True)
 
-    write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar)
+    write_gene_sequences(pangenome, tmpdir, proteins, soft_core, compress, disable_bar)
 
-    with open(tmpdir / f"{genes_prot}_genes.fna") as sequences:
-        translate_db = translate_genes(sequences, 'aa_db', tmpdir, threads, True, code)
-    outpath = output / f"{genes_prot}_protein_genes.fna"
+    pangenome_sequences = tmpdir / f"{proteins}_genes.fna"
+    translate_db = translate_genes(pangenome_sequences, 'aa_db', tmpdir, threads, True, code)
+    outpath = output / f"{proteins}_protein_genes.fna"
     cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
     subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
@@ -527,7 +529,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa
 
 
 def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None,
-                         soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None,
+                         soft_core: float = 0.95, regions: str = None, genes: str = None, proteins: str = None,
                          gene_families: str = None, prot_families: str = None, compress: bool = False,
                          disable_bar: bool = False, **translate_kwgs):
     """
@@ -540,14 +542,14 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
     :param soft_core: Soft core threshold to use
     :param regions: Write the RGP nucleotide sequences
     :param genes: Write all nucleotide CDS sequences
-    :param genes_prot: Write amino acid CDS sequences.
+    :param proteins: Write amino acid CDS sequences.
     :param gene_families: Write representative nucleotide sequences of gene families.
     :param prot_families: Write representative amino acid sequences of gene families.
     :param compress: Compress the file in .gz
     :param disable_bar: Disable progress bar
     """
 
-    check_pangenome_to_write_sequences(pangenome, regions, genes, genes_prot, gene_families, prot_families, disable_bar)
+    check_pangenome_to_write_sequences(pangenome, regions, genes, proteins, gene_families, prot_families, disable_bar)
 
     if prot_families is not None:
         write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar)
@@ -555,9 +557,9 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
         write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar)
     if genes is not None:
         write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar)
-    if genes_prot is not None:
-        write_gene_protein_sequences(pangenome, output, genes_prot, soft_core, compress,
-                                     disable_bar=disable_bar, **translate_kwgs)
+    if proteins is not None:
+        write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar,
+                                     **translate_kwgs)
     if regions is not None:
         write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar)
 
@@ -577,7 +579,7 @@ def launch(args: argparse.Namespace):
     pangenome = Pangenome()
     pangenome.add_file(args.pangenome)
     write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core,
-                         regions=args.regions, genes=args.genes, genes_prot=args.genes_prot,
+                         regions=args.regions, genes=args.genes, proteins=args.proteins,
                          gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress,
                          disable_bar=args.disable_prog_bar, **translate_kwgs)
 
diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py
index f9227d38..7cdbb8d3 100644
--- a/ppanggolin/projection/projection.py
+++ b/ppanggolin/projection/projection.py
@@ -538,7 +538,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org
     """
     seq_fasta_files = []
 
-    logging.getLogger('PPanGGOLiN').info('Writting gene sequences of input genomes.')
+    logging.getLogger('PPanGGOLiN').info('Writing gene sequences of input genomes.')
 
     for input_organism in input_organisms:
         seq_outdir = output / input_organism.name
@@ -557,14 +557,14 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org
         if use_representatives:
             _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_files, output=new_tmpdir,
                                                                        tmpdir=new_tmpdir, is_input_seq_nt=True,
-                                                                       cpu=cpu, no_defrag=no_defrag, identity=identity,
-                                                                       coverage=coverage,
+                                                                       is_input_slf=False, cpu=cpu, no_defrag=no_defrag,
+                                                                       identity=identity, coverage=coverage,
                                                                        translation_table=translation_table)
         else:
             _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome,
                                                                        sequence_files=seq_fasta_files,
                                                                        output=new_tmpdir, tmpdir=new_tmpdir,
-                                                                       is_input_seq_nt=True,
+                                                                       is_input_seq_nt=True, is_input_slf=False,
                                                                        cpu=cpu, no_defrag=no_defrag, identity=identity,
                                                                        coverage=coverage,
                                                                        translation_table=translation_table,

From e09e5f7f68df5c5bff9609a47e755503f3a9a9a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Thu, 6 Jun 2024 15:51:34 +0200
Subject: [PATCH 16/25] Improve documentation on write fasta

---
 docs/user/writeFasta.md | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md
index 843f0788..7bae4712 100644
--- a/docs/user/writeFasta.md
+++ b/docs/user/writeFasta.md
@@ -20,7 +20,8 @@ When using the `softcore` filter, the `--soft_core` option can be used to modify
 
 ### Nucleotide sequences
 
-This option can be used to write the nucleotide CDS sequences. It can be used as such, to write all the genes of the pangenome for example:
+With the `--genes partition` option PPanGGOLiN will write the nucleotide CDS sequences for the given partition.
+It can be used as such, to write all the genes of the pangenome for example:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes all
@@ -34,7 +35,8 @@ ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent
 
 ### Protein sequences
 
-This option can be used to write the amino acid CDS sequences. It can be used as such, to write all the genes of the pangenome for example:
+With the `--proteins partition` option PPanGGOLiN will write the nucleotide CDS sequences for the given partition. 
+It can be used as such, to write all the genes of the pangenome for example:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_GENES --proteins all
@@ -46,19 +48,23 @@ Or to write only the cloud genes:
 ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot cloud
 ```
 
-To translate the genes sequences, PPanGGOLiN use [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. So for this option you can give multiple threads with `--threads`. You can also specify the translation table to use with `--translate_table`. Finally, you can keep the [MMSeqs2](https://github.com/soedinglab/MMseqs2) that are generated in the temporary directory (that you can also be specified with `--tmpdir`) by indicate the option `--keep_tmp`.
+To translate the gene sequences, PPanGGOLiN uses the [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. 
+So for this option you can specify multiple threads with `--cpu`. 
+You can also specify the translation table to use with `--translate_table`. 
+Finally, you can keep the temporary directory -that you can specify with `--tmpdir`- with the [MMSeqs2](https://github.com/soedinglab/MMseqs2) database using the `--keep_tmp` option.
 
 ## Gene families
 
 ### Protein sequences
 
-This option can be used to write the protein sequences of the representative sequences for each family. It can be used as such for all families:
+With the `--prot_families partition` option PPanGGOLiN will write the protein sequences of the representative gene for each family for the given partition. 
+It can be used as such for all families:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families all
 ```
 
-or for all the shell families for example:
+Or for all the shell families for example:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell
@@ -66,16 +72,33 @@ ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell
 
 ### Nucleotide sequences
 
-This option can be used to write the gene sequences of the representative sequences for each family. It can be used as such:
+With the `--gene_families partition` option PPanGGOLiN will write the nucleotide sequences of the representative gene for each family for the given partition. 
+It can be used as such for all families:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families all
 ```
 
-or for the cloud families for example:
+Or for the core families for example:
 
 ```bash
-ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families cloud
+ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families core
+```
+
+
+## Modules
+All the precedent command admit a module as partition.
+
+So you can write the protein sequences for the family in module_X as such:  
+
+```bash
+ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --prot_families module_X
+```
+
+Or the nucleotide sequence of all genes in module_X:
+
+```bash
+ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --genes module_X
 ```
 
 ## Regions
@@ -91,4 +114,4 @@ It can be used as such:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --regions all --fasta genomes.fasta.list
-```
+```
\ No newline at end of file

From 9c8fb76c8565092f08623de2664b95f736843547 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 7 Jun 2024 09:45:20 +0200
Subject: [PATCH 17/25] Fix createdb mode

---
 .github/workflows/main.yml              |  2 +-
 ppanggolin/align/alignOnPang.py         | 38 ++++++++++++-------------
 ppanggolin/cluster/cluster.py           |  3 +-
 ppanggolin/context/searchGeneContext.py |  2 +-
 ppanggolin/formats/writeSequences.py    | 19 +++++++------
 ppanggolin/projection/projection.py     | 10 ++++---
 pyproject.toml                          |  6 ++++
 7 files changed, 45 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 94e5f44f..2abe5b7f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -100,7 +100,7 @@ jobs:
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --threads 1 --keep_tmp 
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --cpu $NUM_CPUS --keep_tmp 
 
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log
diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
index 0742a876..0069b798 100644
--- a/ppanggolin/align/alignOnPang.py
+++ b/ppanggolin/align/alignOnPang.py
@@ -74,8 +74,7 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi
     with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB",
                                      delete=False) as aln_db:
         cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a",
-               "--min-seq-id", str(identity),
-               "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu),
+               "--min-seq-id", str(identity), "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu),
                "--seed-sub-mat", "VTML40.out", "-s", "2", '--comp-bias-corr', "0", "--mask", "0", "-e", "1"]
 
         logging.getLogger("PPanGGOLiN").info("Aligning sequences")
@@ -180,18 +179,19 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool, bool]:
     seq_set = set()
     seq_count = 0
     first_seq_concat = ""
-    single_line_fasta = False
+    single_line_fasta = True
     count_fasta_line = 0
     for line in seq_file:
         if line.startswith(">"):
             seq_set.add(line[1:].split()[0].strip())
             seq_count += 1
             if count_fasta_line > 1:  # Allow to know if we can use soft link with createdb from MMSeqs2
-                single_line_fasta = True
+                single_line_fasta = False
             count_fasta_line = 0
-        elif seq_count <= 20:
-            first_seq_concat += line.strip()
-        count_fasta_line += 1
+        else:
+            count_fasta_line += 1
+            if seq_count <= 20:
+                first_seq_concat += line.strip()
 
     char_counter = Counter(first_seq_concat)
     is_nucleotide = all(char in dna_expected_char for char in char_counter)
@@ -354,18 +354,18 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel
     logging.getLogger("PPanGGOLiN").info("Writing RGP and spot information related to hits in the pangenome")
     multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"])
 
-    finfo = open(output / "info_input_seq.tsv", "w")
-    finfo.write("input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n")
-    fam2rgp = get_fam_to_rgp(pangenome, multigenics)
-    fam2spot, fam2border = get_fam_to_spot(pangenome, multigenics)
-    spot_list = set()
-    for seq, panfam in seq_to_pang.items():
-        finfo.write(seq + '\t' + panfam.name + "\t" + panfam.named_partition + "\t" + ",".join(
-            map(str, fam2spot[panfam])) + "\t" + ",".join(
-            map(str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n")
-        spot_list |= set(fam2spot[panfam])
-        spot_list |= set(fam2border[panfam])
-    finfo.close()
+    with open(output / "info_input_seq.tsv", "w") as finfo:
+        finfo.write("input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n")
+        fam2rgp = get_fam_to_rgp(pangenome, multigenics)
+        fam2spot, fam2border = get_fam_to_spot(pangenome, multigenics)
+        spot_list = set()
+        for seq, panfam in seq_to_pang.items():
+            finfo.write(seq + '\t' + panfam.name + "\t" + panfam.named_partition + "\t" + ",".join(
+                map(str, fam2spot[panfam])) + "\t" + ",".join(
+                map(str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n")
+            spot_list |= set(fam2spot[panfam])
+            spot_list |= set(fam2border[panfam])
+
     if draw_related:
         drawn_spots = set()
         for spot in spot_list:
diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index 639c578e..c7d3d1f3 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -85,7 +85,8 @@ def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11
 
     :return: path to representative sequence file and path to tsv clustering result
     """
-    seqdb = translate_genes(sequences, 'aa_db', tmpdir, cpu, True, code)
+    seqdb = translate_genes(sequences=sequences, db_name='aa_db', tmpdir=tmpdir, cpu=cpu,
+                            is_single_line_fasta=True, code=code)
     logging.getLogger("PPanGGOLiN").info("Clustering sequences...")
     cludb = tmpdir / 'cluster_db'
     cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id",
diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py
index 0312e4fa..508203fc 100644
--- a/ppanggolin/context/searchGeneContext.py
+++ b/ppanggolin/context/searchGeneContext.py
@@ -650,7 +650,7 @@ def parser_context(parser: argparse.ArgumentParser):
                           default='graphml', choices=['gexf', 'graphml'])
     optional.add_argument("-c", "--cpu", required=False, default=1, type=int,
                           help="Number of available cpus")
-    optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()),
+    optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()),
                           help="directory for storing temporary files")
     optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
                           help="Keeping temporary files (useful for debugging).")
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index f27d44ba..bc88ec20 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -151,7 +151,7 @@ def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_m
 
     seq_nucdb = tmpdir / db_name
     cmd = ["mmseqs", "createdb", "--createdb-mode", db_mode, "--dbtype", db_type]
-    cmd += [seq.absolute().as_posix() for seq in sequences] + [seq_nucdb]
+    cmd += [seq.absolute().as_posix() for seq in sequences] + [seq_nucdb.absolute()]
     cmd = list(map(str, cmd))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
     logging.getLogger("PPanGGOLiN").info("Creating sequence database...")
@@ -159,14 +159,14 @@ def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_m
     return seq_nucdb
 
 
-def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir: Path, threads: int = 1,
+def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir: Path, cpu: int = 1,
                     is_single_line_fasta: bool = False, code: int = 11) -> Path:
     """Translate nucleotide sequences into MMSeqs2 amino acid sequences database
 
     :param sequences: File with the nucleotide sequences
     :param db_name: name of the output database
     :param tmpdir: Temporary directory to save the MMSeqs2 files
-    :param threads: Number of available threads to use
+    :param cpu: Number of available threads to use
     :param is_single_line_fasta: Allow to use soft link in MMSeqs2 database
     :param code: Translation code to use
 
@@ -176,7 +176,7 @@ def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir
                                  tmpdir, db_mode=1 if is_single_line_fasta else 0, db_type=2)
     logging.getLogger("PPanGGOLiN").debug("Translate sequence ...")
     seqdb = tmpdir / db_name
-    cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code]))
+    cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code]))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
     subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
     return seqdb
@@ -227,7 +227,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
 
 def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: str, soft_core: float = 0.95,
                                  compress: bool = False, keep_tmp: bool = False, tmp: Path = None,
-                                 threads: int = 1, code: int = 11, disable_bar: bool = False):
+                                 cpu: int = 1, code: int = 11, disable_bar: bool = False):
     """ Write all amino acid sequences from given genes in pangenome
 
     :param pangenome: Pangenome object with gene families sequences
@@ -237,7 +237,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: s
     :param compress: Compress the file in .gz
     :param keep_tmp: Keep temporary directory
     :param tmp: Path to temporary directory
-    :param threads: Number of threads available
+    :param cpu: Number of threads available
     :param code: Genetic code use to translate nucleotide sequences to protein sequences
     :param disable_bar: Disable progress bar
     """
@@ -247,7 +247,8 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: s
     write_gene_sequences(pangenome, tmpdir, proteins, soft_core, compress, disable_bar)
 
     pangenome_sequences = tmpdir / f"{proteins}_genes.fna"
-    translate_db = translate_genes(pangenome_sequences, 'aa_db', tmpdir, threads, True, code)
+    translate_db = translate_genes(sequences=pangenome_sequences, db_name='aa_db', tmpdir=tmpdir,
+                                   cpu=cpu, is_single_line_fasta=True, code=code)
     outpath = output / f"{proteins}_protein_genes.fna"
     cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
@@ -572,7 +573,7 @@ def launch(args: argparse.Namespace):
     """
     check_write_sequences_args(args)
     translate_kwgs = {"code": args.translation_table,
-                      "threads": args.threads,
+                      "cpu": args.cpu,
                       "tmp": args.tmpdir,
                       "keep_tmp": args.keep_tmp}
     mk_outdir(args.output, args.force)
@@ -661,7 +662,7 @@ def parser_seq(parser: argparse.ArgumentParser):
     optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz")
     optional.add_argument("--translation_table", required=False, default="11",
                           help="Translation table (genetic code) to use.")
-    optional.add_argument("--threads", required=False, default=1, type=int, help="Number of available threads")
+    optional.add_argument("--cpu", required=False, default=1, type=int, help="Number of available threads")
     optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()),
                           help="directory for storing temporary files")
     optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py
index 7cdbb8d3..cddfd338 100644
--- a/ppanggolin/projection/projection.py
+++ b/ppanggolin/projection/projection.py
@@ -555,16 +555,18 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org
     with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
 
         if use_representatives:
-            _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_files, output=new_tmpdir,
-                                                                       tmpdir=new_tmpdir, is_input_seq_nt=True,
-                                                                       is_input_slf=False, cpu=cpu, no_defrag=no_defrag,
+            _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome=pangenome,
+                                                                       sequence_files=seq_fasta_files,
+                                                                       output=new_tmpdir, tmpdir=new_tmpdir,
+                                                                       is_input_seq_nt=True, is_input_slf=True,
+                                                                       cpu=cpu, no_defrag=no_defrag,
                                                                        identity=identity, coverage=coverage,
                                                                        translation_table=translation_table)
         else:
             _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome,
                                                                        sequence_files=seq_fasta_files,
                                                                        output=new_tmpdir, tmpdir=new_tmpdir,
-                                                                       is_input_seq_nt=True, is_input_slf=False,
+                                                                       is_input_seq_nt=True, is_input_slf=True,
                                                                        cpu=cpu, no_defrag=no_defrag, identity=identity,
                                                                        coverage=coverage,
                                                                        translation_table=translation_table,
diff --git a/pyproject.toml b/pyproject.toml
index 7864f533..abdbf2d7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -81,3 +81,9 @@ packages = ["ppanggolin"]
 
 [tool.setuptools.dynamic]
 version = {file = "VERSION"}
+
+[tool.pytest.ini_options]
+markers = [
+    "nucleotides: marks tests to run with nucleotides alphabets",
+    "aminoacids: marks tests to run with amino-acids alphabets",
+]
\ No newline at end of file

From 56e4318dbe33dc6ab92e6f518b1ce5b7b11ab9f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 7 Jun 2024 10:28:47 +0200
Subject: [PATCH 18/25] Improve documentation on write fasta

---
 docs/user/writeFasta.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md
index 7bae4712..10725d5b 100644
--- a/docs/user/writeFasta.md
+++ b/docs/user/writeFasta.md
@@ -49,9 +49,9 @@ ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot cloud
 ```
 
 To translate the gene sequences, PPanGGOLiN uses the [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. 
-So for this option you can specify multiple threads with `--cpu`. 
-You can also specify the translation table to use with `--translate_table`. 
-Finally, you can keep the temporary directory -that you can specify with `--tmpdir`- with the [MMSeqs2](https://github.com/soedinglab/MMseqs2) database using the `--keep_tmp` option.
+So for this option you can specify multiple threads with `--cpu`.
+You can also specify the translation table to use with `--translate_table`.
+The temporary directory, can be specified with `--tmpdir` to store the [MMSeqs2](https://github.com/soedinglab/MMseqs2) database and other files. Temporary files will be deleted at the end of the execution. To keep them, you can use the `--keep_tmp` option.
 
 ## Gene families
 

From 13bd4e0aed2ca17ce5655ebb0a1ac1854da621d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 7 Jun 2024 15:02:58 +0200
Subject: [PATCH 19/25] Add test unit for align

---
 tests/align/test_align.py | 111 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 tests/align/test_align.py

diff --git a/tests/align/test_align.py b/tests/align/test_align.py
new file mode 100644
index 00000000..e5ce2dde
--- /dev/null
+++ b/tests/align/test_align.py
@@ -0,0 +1,111 @@
+import pytest
+from typing import List
+from random import choice, randint
+
+from ppanggolin.align.alignOnPang import get_seq_ids
+
+
+@pytest.fixture
+def nucleotides():
+    yield ['A', 'C', 'G', 'T', 'N']
+
+@pytest.fixture
+def aminoacids():
+    amino_acid_alphabet = [
+        'A',  # Alanine
+        'R',  # Arginine
+        'N',  # Asparagine
+        'D',  # Aspartic acid
+        'C',  # Cysteine
+        'E',  # Glutamic acid
+        'Q',  # Glutamine
+        'G',  # Glycine
+        'H',  # Histidine
+        'I',  # Isoleucine
+        'L',  # Leucine
+        'K',  # Lysine
+        'M',  # Methionine
+        'F',  # Phenylalanine
+        'P',  # Proline
+        'S',  # Serine
+        'T',  # Threonine
+        'W',  # Tryptophan
+        'Y',  # Tyrosine
+        'V'  # Valine
+    ]
+    yield amino_acid_alphabet
+
+@pytest.fixture
+def number_of_sequences():
+    yield randint(4, 10)
+@pytest.fixture
+def single_line_fasta(request, tmp_path_factory: pytest.TempPathFactory, number_of_sequences,
+                      aminoacids: List[str], nucleotides: List[str]):
+    if request.node.get_closest_marker('aminoacids'):
+        alphabet = aminoacids
+        fasta_path = tmp_path_factory.getbasetemp() / "single_line_nt.fasta"
+    else:
+        alphabet = nucleotides
+        fasta_path = tmp_path_factory.getbasetemp() / "single_line_aa.fasta"
+    with open(fasta_path, "w") as fasta:
+        for i in range(number_of_sequences):
+            fasta.write(f">Gene_{i}\n")
+            fasta.write("".join([choice(alphabet) for _ in range(0, randint(30, 100))]))
+            fasta.write("\n")
+    yield fasta_path
+
+@pytest.fixture
+def multi_line_fasta(request, tmp_path_factory: pytest.TempPathFactory, number_of_sequences,
+                     aminoacids: List[str], nucleotides: List[str]):
+    if request.node.get_closest_marker('aminoacids'):
+        alphabet = aminoacids
+        fasta_path = tmp_path_factory.getbasetemp() / "single_line_nt.fasta"
+    else:
+        alphabet = nucleotides
+        fasta_path = tmp_path_factory.getbasetemp() / "single_line_aa.fasta"
+    with open(fasta_path, "w") as fasta:
+        for i in range(number_of_sequences):
+            fasta.write(f">Gene_{i}\n")
+            for j in range(randint(4,10)):
+                fasta.write("".join([choice(alphabet) for _ in range(60)]))
+                fasta.write("\n")
+    yield fasta_path
+
+@pytest.mark.nucleotides
+def test_get_seq_ids_single_line_nt(number_of_sequences, single_line_fasta):
+    with open(single_line_fasta, "r") as fasta:
+        seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta)
+        assert len(seq_set) == number_of_sequences
+        assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)}
+        assert is_nucleotide
+        assert single_line_fasta
+
+@pytest.mark.aminoacids
+def test_get_seq_ids_single_line_aa(number_of_sequences, single_line_fasta):
+    with open(single_line_fasta, "r") as fasta:
+        seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta)
+        assert len(seq_set) == number_of_sequences
+        assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)}
+        assert not is_nucleotide
+        assert single_line_fasta
+
+@pytest.mark.nucleotides
+def test_get_seq_ids_multi_line_nt(number_of_sequences, multi_line_fasta):
+    with open(multi_line_fasta, "r") as fasta:
+        seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta)
+        assert len(seq_set) == number_of_sequences
+        assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)}
+        assert is_nucleotide
+        assert not single_line_fasta
+
+@pytest.mark.aminoacids
+def test_get_seq_ids_multi_line_aa(number_of_sequences, multi_line_fasta):
+    with open(multi_line_fasta, "r") as fasta:
+        seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta)
+        assert len(seq_set) == number_of_sequences
+        assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)}
+        assert not is_nucleotide
+        assert not single_line_fasta
+
+
+

From b5b0e4dcfe2eecd7e7d1c9f2ea9e13fe0562a2eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 7 Jun 2024 15:28:13 +0200
Subject: [PATCH 20/25] Use context manager to write sequences

---
 ppanggolin/align/alignOnPang.py      |  74 ++++---
 ppanggolin/cluster/cluster.py        |  60 +++---
 ppanggolin/formats/readBinaries.py   |  79 ++++----
 ppanggolin/formats/writeSequences.py | 107 +++++-----
 ppanggolin/projection/projection.py  | 291 ++++++++++++++-------------
 5 files changed, 305 insertions(+), 306 deletions(-)

diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
index 0069b798..2b5547d3 100644
--- a/ppanggolin/align/alignOnPang.py
+++ b/ppanggolin/align/alignOnPang.py
@@ -9,7 +9,7 @@
 import subprocess
 import argparse
 from collections import defaultdict, Counter
-from typing import List, Tuple, Set, Dict, TextIO, Union, Iterable, Any
+from typing import List, Tuple, Set, Dict, Union, Iterable, Any
 from pathlib import Path
 
 from tqdm import tqdm
@@ -25,8 +25,8 @@
 from ppanggolin.formats.writeSequences import translate_genes, create_mmseqs_db
 
 
-def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]], tmpdir: Path, cpu: int = 1,
-                      no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
+def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]],
+                      tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
                       is_query_nt: bool = False, is_query_slf: bool = False, is_target_nt: bool = False,
                       is_target_slf: bool = False, translation_table: int = None) -> Path:
     """
@@ -51,7 +51,9 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi
     if is_target_nt:
         logging.getLogger("PPanGGOLiN").debug("Target sequences will be translated by mmseqs with "
                                               f"translation table {translation_table}")
-        target_db = translate_genes(target_seq_file, 'target_db', tmpdir, cpu, is_target_slf, translation_table)
+        with create_tmpdir(tmpdir, basename="target_db", keep_tmp=True) as target_db_dir:
+            #  Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same
+            target_db = translate_genes(target_seq_file, target_db_dir, cpu, is_target_slf, translation_table)
     else:
         target_db = create_mmseqs_db([target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file,
                                      'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=1)
@@ -59,7 +61,9 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi
     if is_query_nt:
         logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs "
                                               f"with translation table {translation_table}")
-        query_db = translate_genes(query_seq_files, 'query_db', tmpdir, cpu, is_query_slf, translation_table)
+        with create_tmpdir(tmpdir, basename="query_db", keep_tmp=True) as query_db_dir:
+            #  Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same
+            query_db = translate_genes(query_seq_files, query_db_dir, cpu, is_query_slf, translation_table)
     else:
         query_db = create_mmseqs_db([query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files,
                                     'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=1)
@@ -199,35 +203,34 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool, bool]:
     return seq_set, is_nucleotide, single_line_fasta
 
 
-def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIO, add: str = "", disable_bar: bool = False):
+def write_gene_fam_sequences(pangenome: Pangenome, output: Path, add: str = "", disable_bar: bool = False):
     """
     Export the sequence of gene families
 
     :param pangenome: Pangenome containing families
-    :param file_obj: Temporary file where sequences will be written
+    :param output: Path to file where sequences will be written
     :param add: Add prefix to sequence name
     :param disable_bar: disable progress bar
     """
-    for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar,
-                    total=pangenome.number_of_gene_families):
-        file_obj.write(">" + add + fam.name + "\n")
-        file_obj.write(fam.sequence + "\n")
-    # file_obj.flush()
+    with open(output, "w") as file_obj:
+        for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar,
+                        total=pangenome.number_of_gene_families):
+            file_obj.write(">" + add + fam.name + "\n")
+            file_obj.write(fam.sequence + "\n")
 
 
-def write_all_gene_sequences(pangenome: Pangenome, file_obj: TextIO, add: str = "", disable_bar: bool = False):
+def write_all_gene_sequences(pangenome: Pangenome, output: Path, add: str = "", disable_bar: bool = False):
     """
     Export the sequence of pangenome genes
 
     :param pangenome: Pangenome containing genes
-    :param file_obj: Temporary file where sequences will be written
+    :param output: Path to file where sequences will be written
     :param add: Add prefix to sequence name
     :param disable_bar: disable progress bar
-
     """
 
     if pangenome.status["geneSequences"] == "inFile":
-        get_non_redundant_gene_sequences_from_file(pangenome.file, file_obj, add=add, disable_bar=disable_bar)
+        get_non_redundant_gene_sequences_from_file(pangenome.file, output, add=add, disable_bar=disable_bar)
     else:
         # this should never happen if the pangenome has been properly checked before launching this function.
         raise Exception("The pangenome does not include gene sequences")
@@ -417,20 +420,16 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union
              and a dictionary mapping input sequences to gene families.
 
     """
-    # delete False to be able to keep tmp file. If they are not keep tmpdir will be destroyed so no need to delete tmpfile
-    with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False,
-                                     prefix="representative_genes", suffix=".faa") as tmp_pang_file:
-        logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {tmp_pang_file.name}')
-        pangenome_sequences = Path(tmp_pang_file.name)
-        write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar)
+    pangenome_sequences = tmpdir / "proteins_families.faa"
+    logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {pangenome_sequences.absolute()}')
+    write_gene_fam_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar)
 
-        align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files,
-                                       tmpdir=tmpdir, cpu=cpu,
-                                       no_defrag=no_defrag, identity=identity, coverage=coverage,
-                                       is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf,
-                                       is_target_nt=False, is_target_slf=False, translation_table=translation_table)
+    align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files,
+                                   tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
+                                   is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf,
+                                   is_target_nt=False, is_target_slf=True, translation_table=translation_table)
 
-        seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome)
+    seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome)
 
     return align_file, seq2pang
 
@@ -462,19 +461,16 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files:  Unio
     :return: A tuple containing the path to the alignment result file, 
              and a dictionary mapping input sequences to gene families.
     """
+    pangenome_sequences = tmpdir / "nucleotide_genes.fna"
+    logging.getLogger("PPanGGOLiN").debug(f'Write all pangenome gene sequences in {pangenome_sequences.absolute()}')
+    write_all_gene_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar)
 
-    with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False,
-                                     prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file:
-        logging.getLogger("PPanGGOLiN").debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}')
-        pangenome_sequences = Path(tmp_pang_file.name)
-        write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar)
-
-        align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir,
-                                       cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
-                                       is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True,
-                                       is_target_slf=True, translation_table=translation_table)
+    align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir,
+                                   cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
+                                   is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True,
+                                   is_target_slf=True, translation_table=translation_table)
 
-        seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome)
+    seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome)
 
     return align_file, seq2pang
 
diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index c7d3d1f3..faab8873 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -8,7 +8,7 @@
 from collections import defaultdict
 import os
 import argparse
-from typing import TextIO, Tuple, Dict, Set
+from typing import Tuple, Dict, Set
 from pathlib import Path
 import time
 
@@ -20,11 +20,10 @@
 from ppanggolin.pangenome import Pangenome
 from ppanggolin.genome import Gene
 from ppanggolin.geneFamily import GeneFamily
-from ppanggolin.utils import read_compressed_or_not, restricted_float
+from ppanggolin.utils import read_compressed_or_not, restricted_float, create_tmpdir
 from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome
 from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file
 from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes
-from ppanggolin.utils import mk_outdir
 
 
 # Global functions
@@ -43,14 +42,14 @@ def check_pangenome_former_clustering(pangenome: Pangenome, force: bool = False)
 
 
 # Clustering functions
-def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force: bool = False,
+def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force: bool = False,
                                    disable_bar: bool = False):
     """
     Check the pangenome statuses and write the gene sequences in the provided tmpFile.
     (whether they are written in the .h5 file or currently in memory)
 
     :param pangenome: Annotated Pangenome
-    :param tmp_file: Temporary file
+    :param sequences: Path to write the sequences
     :param force: Force to write on existing pangenome information
     :param disable_bar: Allow to disable progress bar
     """
@@ -58,18 +57,19 @@ def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force
     if pangenome.status["geneSequences"] in ["Computed", "Loaded"]:
         logging.getLogger("PPanGGOLiN").debug("Write sequences from annotation loaded in pangenome")
         # we append the gene ids by 'ppanggolin' to avoid crashes from mmseqs when sequence IDs are only numeric.
-        write_gene_sequences_from_annotations(pangenome.genes, tmp_file, add="ppanggolin_", disable_bar=disable_bar)
+        write_gene_sequences_from_annotations(pangenome.genes, sequences, add="ppanggolin_",
+                                              compress=False, disable_bar=disable_bar)
     elif pangenome.status["geneSequences"] == "inFile":
         logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file")
-        write_gene_sequences_from_pangenome_file(pangenome.file, tmp_file, add="ppanggolin_",
-                                                 disable_bar=disable_bar)  # write CDS sequences to the tmpFile
+        write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_",
+                                                 compress=False, disable_bar=disable_bar)  # write CDS sequences to the tmpFile
     else:
-        tmp_file.close()  # closing the tmp file since an exception will be raised.
         raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster "
                         "the genes in gene families. Either provide clustering results (see --clusters), "
                         "or provide a way to access the gene sequence during the annotation step "
                         "(having the fasta in the gff files, or providing the fasta files through the --fasta option)")
 
+
 def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8,
                      identity: float = 0.8, mode: int = 1) -> Tuple[Path, Path]:
     """
@@ -85,7 +85,7 @@ def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11
 
     :return: path to representative sequence file and path to tsv clustering result
     """
-    seqdb = translate_genes(sequences=sequences, db_name='aa_db', tmpdir=tmpdir, cpu=cpu,
+    seqdb = translate_genes(sequences=sequences, tmpdir=tmpdir, cpu=cpu,
                             is_single_line_fasta=True, code=code)
     logging.getLogger("PPanGGOLiN").info("Clustering sequences...")
     cludb = tmpdir / 'cluster_db'
@@ -294,34 +294,23 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
     :param disable_bar: Disable the progress bar during clustering.
     :param keep_tmp_files: Keep temporary files (useful for debugging).
     """
-
-    if keep_tmp_files:
-        date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime())
-
-        dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}'
-        tmp_path = Path(tmpdir) / dir_name
-        mk_outdir(tmp_path, force=True)
-    else:
-        newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
-        tmp_path = Path(newtmpdir.name)
-
-    sequence_path = tmp_path/'nucleotid_sequences'
-    with open(sequence_path, "w") as sequence_file:
-        check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar)
+    date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime())
+    dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}'
+    with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path:
+        sequence_path = tmp_path/'nucleotide_sequences.fna'
+        check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar)
         logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...")
         rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode)
 
-    fam2seq = read_faa(rep)
-    if not defrag:
-        logging.getLogger("PPanGGOLiN").debug("No defragmentation")
-        genes2fam, _ = read_tsv(tsv)
-    else:
-        logging.getLogger("PPanGGOLiN").info("Associating fragments to their original gene family...")
-        aln = align_rep(rep, tmp_path, cpu, coverage, identity)
-        genes2fam, fam2seq = refine_clustering(tsv, aln, fam2seq)
-        pangenome.status["defragmented"] = "Computed"
-    if not keep_tmp_files:
-        newtmpdir.cleanup()
+        fam2seq = read_faa(rep)
+        if not defrag:
+            logging.getLogger("PPanGGOLiN").debug("No defragmentation")
+            genes2fam, _ = read_tsv(tsv)
+        else:
+            logging.getLogger("PPanGGOLiN").info("Associating fragments to their original gene family...")
+            aln = align_rep(rep, tmp_path, cpu, coverage, identity)
+            genes2fam, fam2seq = refine_clustering(tsv, aln, fam2seq)
+            pangenome.status["defragmented"] = "Computed"
     read_fam2seq(pangenome, fam2seq)
     read_gene2fam(pangenome, genes2fam, disable_bar=disable_bar)
 
@@ -512,6 +501,7 @@ def parser_clust(parser: argparse.ArgumentParser):
                             "with their original gene family.")
     clust.add_argument("--translation_table", required=False, default="11",
                        help="Translation table (genetic code) to use.")
+    # clust.add_argument("--compress")
 
     clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
 
diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index bb0f02d5..4c90be2c 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -4,7 +4,7 @@
 # default libraries
 import logging
 from pathlib import Path
-from typing import TextIO, Dict, Any, Set, List, Tuple
+from typing import Dict, Any, Set, List, Tuple
 from collections import defaultdict
 
 # installed libraries
@@ -17,6 +17,7 @@
 from ppanggolin.geneFamily import GeneFamily
 from ppanggolin.region import Region, Spot, Module
 from ppanggolin.metadata import Metadata
+from ppanggolin.utils import write_compressed_or_not
 
 
 class Genedata:
@@ -26,7 +27,7 @@ class Genedata:
     """
 
     def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: int, name: str, product: str,
-                 genetic_code: int, coordinates:List[Tuple[int]] = None):
+                 genetic_code: int, coordinates: List[Tuple[int]] = None):
         """Constructor method
 
         :param start: Gene start position
@@ -60,6 +61,7 @@ def __eq__(self, other):
             and self.genetic_code == other.genetic_code \
             and self.coordinates == other.coordinates \
 
+
     def __hash__(self):
         return hash((self.start, self.stop, self.strand, self.gene_type, self.position,
                      self.name, self.product, self.genetic_code, tuple(self.coordinates)))
@@ -172,9 +174,10 @@ def read_genedata(h5f: tables.File) -> Dict[int, Genedata]:
     for row in read_chunks(table, chunk=20000):
         start = int(row["start"])
         stop = int(row["stop"])
-        
-        if "has_joined_coordinates" in row.dtype.names and row["has_joined_coordinates"]: # manage gene with joined coordinates if the info exists
-            
+
+        if "has_joined_coordinates" in row.dtype.names and row["has_joined_coordinates"]:
+            # manage gene with joined coordinates if the info exists
+
             try:
                 coordinates = genedata_id_to_coordinates[row["genedata_id"]]
             except KeyError:
@@ -183,7 +186,6 @@ def read_genedata(h5f: tables.File) -> Dict[int, Genedata]:
         else:
             coordinates = [(start, stop)]
 
-
         genedata = Genedata(start=start,
                             stop=stop,
                             strand=row["strand"].decode(),
@@ -199,6 +201,7 @@ def read_genedata(h5f: tables.File) -> Dict[int, Genedata]:
 
     return genedata_id2genedata
 
+
 def read_join_coordinates(h5f: tables.File) -> Dict[str, List[Tuple[int, int]]]:
     """
     Read join coordinates from a HDF5 file and return a dictionary mapping genedata_id to coordinates.
@@ -207,7 +210,7 @@ def read_join_coordinates(h5f: tables.File) -> Dict[str, List[Tuple[int, int]]]:
     :return: A dictionary mapping genedata_id to a list of tuples representing start and stop coordinates.
     """
     genedata_id_to_coordinates = defaultdict(list)
-    
+
     if not hasattr(h5f.root.annotations, "joinedCoordinates"):
         # then the pangenome file has no joined annotations 
         # or has been made before the joined annotations coordinates
@@ -218,7 +221,8 @@ def read_join_coordinates(h5f: tables.File) -> Dict[str, List[Tuple[int, int]]]:
     for row in read_chunks(table, chunk=20000):
         genedata_id = row["genedata_id"]
 
-        genedata_id_to_coordinates[genedata_id].append((int(row["coordinate_rank"]), int(row["start"]), int(row["stop"])))
+        genedata_id_to_coordinates[genedata_id].append(
+            (int(row["coordinate_rank"]), int(row["start"]), int(row["stop"])))
 
     # sort coordinate by their rank
     genedata_id_to_sorted_coordinates = {}
@@ -242,21 +246,21 @@ def read_sequences(h5f: tables.File) -> dict:
     return seqid2seq
 
 
-def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, file_obj: TextIO, add: str = '',
+def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, output: Path, add: str = '',
                                                disable_bar: bool = False):
     """
     Writes the non-redundant CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS,
     and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file.
 
     :param pangenome_filename: Name of the pangenome file
-    :param file_obj: Name of the output file
+    :param output: Path to the output file
     :param add: Add a prefix to sequence header
     :param disable_bar: disable progress bar
 
     """
 
-    logging.getLogger("PPanGGOLiN").info(
-        f"Extracting and writing non redundant CDS sequences from {pangenome_filename} to {file_obj.name}")
+    logging.getLogger("PPanGGOLiN").info(f"Extracting and writing non redundant CDS sequences from {pangenome_filename}"
+                                         f" to {output.absolute()}")
 
     with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f:
 
@@ -269,41 +273,41 @@ def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, file_obj
             seqid2cds_name[row["seqid"]] = row["gene"].decode()
 
         table = h5f.root.annotations.sequences
-        for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar):
-            cds_name = seqid2cds_name[row["seqid"]]
-            file_obj.write(f'>{add}{cds_name}\n')
-            file_obj.write(f'{row["dna"].decode()}\n')
-
-        file_obj.flush()
+        with open(output, "w") as file_obj:
+            for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar):
+                cds_name = seqid2cds_name[row["seqid"]]
+                file_obj.write(f'>{add}{cds_name}\n')
+                file_obj.write(f'{row["dna"].decode()}\n')
 
 
-def write_gene_sequences_from_pangenome_file(pangenome_filename: str, file_obj: TextIO, list_cds: iter = None,
-                                             add: str = '', disable_bar: bool = False):
+def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Path, list_cds: iter = None,
+                                             add: str = '', compress: bool = False, disable_bar: bool = False):
     """
     Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS,
     and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file.
 
     :param pangenome_filename: Name of the pangenome file
-    :param file_obj: Name of the output file
+    :param output: Path to the sequences file
     :param list_cds: An iterable object of CDS
     :param add: Add a prefix to sequence header
+    :param compress: Compress the output file
     :param disable_bar: Prevent to print disable progress bar
     """
-    logging.getLogger("PPanGGOLiN").info(
-        f"Extracting and writing CDS sequences from a {pangenome_filename} file to a fasta file...")
-    h5f = tables.open_file(pangenome_filename, "r", driver_core_backing_store=0)
-
-    table = h5f.root.annotations.geneSequences
-    list_cds = set(list_cds) if list_cds is not None else None
-    seqid2seq = read_sequences(h5f)
-    for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar):
-        # Read the table chunk per chunk otherwise RAM dies on big pangenomes
-        name_cds = row["gene"].decode()
-        if row["type"] == b"CDS" and (list_cds is None or name_cds in list_cds):
-            file_obj.write('>' + add + name_cds + "\n")
-            file_obj.write(seqid2seq[row["seqid"]] + "\n")
-    file_obj.flush()
-    h5f.close()
+    logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {pangenome_filename} "
+                                         "file to a fasta file...")
+    with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f:
+        table = h5f.root.annotations.geneSequences
+        list_cds = set(list_cds) if list_cds is not None else None
+        seqid2seq = read_sequences(h5f)
+        with write_compressed_or_not(output, compress) as file_obj:
+            for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar):
+                # Read the table chunk per chunk otherwise RAM dies on big pangenomes
+                name_cds = row["gene"].decode()
+                if row["type"] == b"CDS" and (list_cds is None or name_cds in list_cds):
+                    file_obj.write('>' + add + name_cds + "\n")
+                    file_obj.write(seqid2seq[row["seqid"]] + "\n")
+    logging.getLogger("PPanGGOLiN").debug("Gene sequences from pangenome file was written to "
+                                          f"{output.absolute()}{'.gz' if compress else ''}")
 
 
 def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
@@ -520,7 +524,8 @@ def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[in
             local = ""
         gene.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand,
                               gene_type=genedata.gene_type, name=genedata.name, position=genedata.position,
-                              genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local, coordinates=genedata.coordinates)
+                              genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local,
+                              coordinates=genedata.coordinates)
         gene.is_fragment = row["is_fragment"]
         if link:
             contig = pangenome.get_contig(identifier=int(row["contig"]))
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index bc88ec20..272e1a8a 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -6,9 +6,8 @@
 import logging
 import re
 import subprocess
-import sys
 from pathlib import Path
-from typing import TextIO, Dict, Set, Iterable, Union
+from typing import Dict, Set, Iterable, Union
 import tempfile
 import shutil
 
@@ -19,7 +18,8 @@
 from ppanggolin.pangenome import Pangenome
 from ppanggolin.geneFamily import GeneFamily
 from ppanggolin.genome import Gene, Organism
-from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype
+from ppanggolin.utils import (write_compressed_or_not, mk_outdir, create_tmpdir, read_compressed_or_not,
+                              restricted_float, detect_filetype)
 from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file
 
 module_regex = re.compile(r'^module_\d+')  # \d == [0-9]
@@ -115,24 +115,25 @@ def check_pangenome_to_write_sequences(pangenome: Pangenome, regions: str = None
                          need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar)
 
 
-def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_obj: TextIO, add: str = '',
-                                          disable_bar: bool = False):
+def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], output: Path, add: str = '',
+                                          compress: bool = False, disable_bar: bool = False):
     """
     Writes the CDS sequences to a File object,
     and adds the string provided through `add` in front of it.
     Loads the sequences from previously computed or loaded annotations.
 
     :param genes_to_write: Genes to write.
-    :param file_obj: Output file to write sequences.
+    :param output: Path to output file to write sequences.
     :param add: Add prefix to gene ID.
+    :param compress: Compress the file in .gz
     :param disable_bar: Disable progress bar.
     """
-    logging.getLogger("PPanGGOLiN").info(f"Writing all CDS sequences in {file_obj.name}")
-    for gene in tqdm(genes_to_write, unit="gene", disable=disable_bar):
-        if gene.type == "CDS":
-            file_obj.write(f'>{add}{gene.ID}\n')
-            file_obj.write(f'{gene.dna}\n')
-    file_obj.flush()
+    logging.getLogger("PPanGGOLiN").info(f"Writing all CDS sequences in {output.absolute()}")
+    with write_compressed_or_not(output, compress) as file_obj:
+        for gene in tqdm(genes_to_write, unit="gene", disable=disable_bar):
+            if gene.type == "CDS":
+                file_obj.write(f'>{add}{gene.ID}\n')
+                file_obj.write(f'{gene.dna}\n')
 
 
 def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_mode: int = 0, db_type: int = 0) -> Path:
@@ -159,12 +160,11 @@ def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_m
     return seq_nucdb
 
 
-def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir: Path, cpu: int = 1,
+def translate_genes(sequences: Union[Path, Iterable[Path]], tmpdir: Path, cpu: int = 1,
                     is_single_line_fasta: bool = False, code: int = 11) -> Path:
     """Translate nucleotide sequences into MMSeqs2 amino acid sequences database
 
     :param sequences: File with the nucleotide sequences
-    :param db_name: name of the output database
     :param tmpdir: Temporary directory to save the MMSeqs2 files
     :param cpu: Number of available threads to use
     :param is_single_line_fasta: Allow to use soft link in MMSeqs2 database
@@ -172,10 +172,10 @@ def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir
 
     :return: Path to the MMSeqs2 database
     """
-    seq_nucdb = create_mmseqs_db([sequences] if isinstance(sequences, Path) else sequences, f'{db_name}_nt_db',
+    seq_nucdb = create_mmseqs_db([sequences] if isinstance(sequences, Path) else sequences, 'nucleotides_db',
                                  tmpdir, db_mode=1 if is_single_line_fasta else 0, db_type=2)
     logging.getLogger("PPanGGOLiN").debug("Translate sequence ...")
-    seqdb = tmpdir / db_name
+    seqdb = tmpdir / 'translate_db'
     cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code]))
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
     subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
@@ -183,7 +183,7 @@ def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir
 
 
 def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_core: float = 0.95,
-                         compress: bool = False, disable_bar: bool = False):
+                         compress: bool = False, disable_bar: bool = False) -> Path:
     """
     Write all nucleotide CDS sequences
 
@@ -213,16 +213,16 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
         genes_to_write.extend(fam.genes)
 
     logging.getLogger("PPanGGOLiN").info(f"There are {len(genes_to_write)} genes to write")
-    with write_compressed_or_not(outpath, compress) as fasta:
-        if pangenome.status["geneSequences"] in ["inFile"]:
-            write_gene_sequences_from_pangenome_file(pangenome.file, fasta, set([gene.ID for gene in genes_to_write]),
-                                                     disable_bar=disable_bar)
-        elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]:
-            write_gene_sequences_from_annotations(genes_to_write, fasta, disable_bar=disable_bar)
-        else:
-            # this should never happen if the pangenome has been properly checked before launching this function.
-            raise AttributeError("The pangenome does not include gene sequences")
-    logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'")
+    if pangenome.status["geneSequences"] in ["inFile"]:
+        write_gene_sequences_from_pangenome_file(pangenome.file, outpath, set([gene.ID for gene in genes_to_write]),
+                                                 compress=compress, disable_bar=disable_bar)
+    elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]:
+        write_gene_sequences_from_annotations(genes_to_write, outpath, compress=compress, disable_bar=disable_bar)
+    else:
+        # this should never happen if the pangenome has been properly checked before launching this function.
+        raise AttributeError("The pangenome does not include gene sequences")
+    logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}{'.gz' if compress else ''}'")
+    return outpath
 
 
 def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: str, soft_core: float = 0.95,
@@ -241,23 +241,26 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: s
     :param code: Genetic code use to translate nucleotide sequences to protein sequences
     :param disable_bar: Disable progress bar
     """
-    tmpdir = tmp / "translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes")
-    mk_outdir(tmpdir, True, True)
-
-    write_gene_sequences(pangenome, tmpdir, proteins, soft_core, compress, disable_bar)
-
-    pangenome_sequences = tmpdir / f"{proteins}_genes.fna"
-    translate_db = translate_genes(sequences=pangenome_sequences, db_name='aa_db', tmpdir=tmpdir,
-                                   cpu=cpu, is_single_line_fasta=True, code=code)
-    outpath = output / f"{proteins}_protein_genes.fna"
-    cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
-    logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
-    subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
-    logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'")
-
-    if not keep_tmp:
-        logging.getLogger("PPanGGOLiN").debug("Clean temporary directory")
-        shutil.rmtree(tmpdir)
+    with create_tmpdir(tmp if tmp is not None else Path(tempfile.gettempdir()),
+                       basename="translateGenes", keep_tmp=keep_tmp) as tmpdir:
+
+        write_gene_sequences(pangenome, tmpdir, proteins, soft_core, compress, disable_bar)
+
+        pangenome_sequences = tmpdir / f"{proteins}_genes.fna{'.gz' if compress else ''}"
+        translate_db = translate_genes(sequences=pangenome_sequences, tmpdir=tmpdir,
+                                       cpu=cpu, is_single_line_fasta=True, code=code)
+        outpath = output / f"{proteins}_protein_genes.fna"
+        cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
+        logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
+        subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
+        if compress:
+            with write_compressed_or_not(outpath, compress) as compress_file:
+                with open(outpath, "r") as sequence_file:
+                    shutil.copyfileobj(sequence_file, compress_file)
+            outpath.unlink()
+            logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}.gz'")
+        else:
+            logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'")
 
 
 def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]:
@@ -331,12 +334,11 @@ def write_fasta_gene_fam(pangenome: Pangenome, output: Path, gene_families: str,
     genefams = select_families(pangenome, gene_families, "representative nucleotide sequences of the gene families",
                                soft_core)
 
-    with write_compressed_or_not(outpath, compress) as fasta:
-        write_gene_sequences_from_pangenome_file(pangenome.file, fasta, [fam.name for fam in genefams],
-                                                 disable_bar=disable_bar)
+    write_gene_sequences_from_pangenome_file(pangenome.file, outpath, [fam.name for fam in genefams],
+                                             compress=compress, disable_bar=disable_bar)
 
-    logging.getLogger("PPanGGOLiN").info(
-        f"Done writing the representative nucleotide sequences of the gene families : '{outpath}'")
+    logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences "
+                                         f"of the gene families : '{outpath}{'.gz' if compress else ''}")
 
 
 def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, soft_core: float = 0.95,
@@ -361,8 +363,8 @@ def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str,
         for fam in tqdm(genefams, unit="prot families", disable=disable_bar):
             fasta.write('>' + fam.name + "\n")
             fasta.write(fam.sequence + "\n")
-    logging.getLogger("PPanGGOLiN").info(
-        f"Done writing the representative amino acid sequences of the gene families : '{outpath}'")
+    logging.getLogger("PPanGGOLiN").info(f"Done writing the representative amino acid sequences of the gene families:"
+                                         f"'{outpath}{'.gz' if compress else ''}'")
 
 
 def read_fasta_or_gff(file_path: Path) -> Dict[str, str]:
@@ -526,7 +528,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa
             fasta.write(f">{region.name}\n")
             fasta.write(
                 write_spaced_fasta(genome_sequence[region.contig.name][region.start:region.stop], 60))
-    logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: '{outname}'")
+    logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: "
+                                         f"'{outname}{'.gz' if compress else ''}'")
 
 
 def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None,
diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py
index cddfd338..c06331e9 100644
--- a/ppanggolin/projection/projection.py
+++ b/ppanggolin/projection/projection.py
@@ -19,7 +19,6 @@
 from tqdm import tqdm
 import networkx as nx
 import yaml
-import pandas as pd
 
 # # local libraries
 from ppanggolin.annotate.synta import read_fasta, get_dna_sequence
@@ -39,8 +38,10 @@
 from ppanggolin.genome import Organism
 from ppanggolin.geneFamily import GeneFamily
 from ppanggolin.region import Region, Spot, Module
-from ppanggolin.formats.writeFlatGenomes import write_proksee_organism, manage_module_colors, write_gff_file, write_tsv_genome_file
-from ppanggolin.formats.writeFlatPangenome import summarize_spots, summarize_genome, write_summaries_in_tsv, write_rgp_table
+from ppanggolin.formats.writeFlatGenomes import write_proksee_organism, manage_module_colors, write_gff_file, \
+    write_tsv_genome_file
+from ppanggolin.formats.writeFlatPangenome import summarize_spots, summarize_genome, write_summaries_in_tsv, \
+    write_rgp_table
 from ppanggolin.formats.writeSequences import read_genome_file
 
 
@@ -53,7 +54,8 @@ class NewSpot(Spot):
     def __str__(self):
         return f'new_spot_{str(self.ID)}'
 
-def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool):
+
+def check_pangenome_for_projection(pangenome: Pangenome, fast_aln: bool):
     """
     Check the status of a pangenome and determine whether projection is possible.
 
@@ -76,7 +78,6 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool):
     predict_rgp = True
     project_spots = True
 
-
     if pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]:
         raise NameError("The provided pangenome has not been partitioned. "
                         "Annotation of an external genome is therefore not possible. "
@@ -99,7 +100,7 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool):
                                              "Projection of modules into the provided genome will not be performed.")
 
         project_modules = False
-    
+
     if pangenome.status["geneSequences"] not in ["Loaded", "Computed", "inFile"] and not fast_aln:
         raise Exception("The provided pangenome has no gene sequences. "
                         "Projection is still possible with the --fast option to use representative "
@@ -108,22 +109,19 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool):
     if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"]:
         raise Exception("The provided pangenome has no gene families sequences. "
                         "This is not possible to annotate an input genome to this pangenome.")
-    
+
     return predict_rgp, project_spots, project_modules
 
 
-def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, 
-                                    organism_name, circular_contigs, pangenome_params, 
-                                    cpu, use_pseudo, disable_bar, tmpdir, config):
-    """
-    """
+def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, organism_name, circular_contigs,
+                                    pangenome_params, cpu, use_pseudo, disable_bar, tmpdir, config):
     genome_name_to_path = None
 
     if input_mode == "multiple":
         if anno:
             input_type = "annotation"
             genome_name_to_path = parse_input_paths_file(anno)
-        
+
         elif fasta:
             input_type = "fasta"
             genome_name_to_path = parse_input_paths_file(fasta)
@@ -134,19 +132,19 @@ def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta,
         if anno:
             input_type = "annotation"
             genome_name_to_path = {organism_name: {"path": anno,
-                                                            "circular_contigs": circular_contigs}}
-        
+                                                   "circular_contigs": circular_contigs}}
+
         elif fasta:
             input_type = "fasta"
             genome_name_to_path = {organism_name: {"path": fasta,
-                                                            "circular_contigs": circular_contigs}}
+                                                   "circular_contigs": circular_contigs}}
 
     if input_type == "annotation":
         check_input_names(pangenome, genome_name_to_path)
 
         organisms, org_2_has_fasta = read_annotation_files(genome_name_to_path, cpu=cpu, pseudo=use_pseudo,
-                    disable_bar=disable_bar)
-        
+                                                           disable_bar=disable_bar)
+
         if not all((has_fasta for has_fasta in org_2_has_fasta.values())):
             organisms_with_no_fasta = {org for org, has_fasta in org_2_has_fasta.items() if not has_fasta}
             if fasta:
@@ -154,9 +152,9 @@ def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta,
 
             else:
                 raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) "
-                                "genomes without associated sequence data, and you did not provide "
-                                "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. "
-                                f"The following genomes have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}")
+                                 "genomes without associated sequence data, and you did not provide "
+                                 "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. "
+                                 f"The following genomes have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}")
 
     elif input_type == "fasta":
         annotate_param_names = ["norna", "kingdom",
@@ -165,22 +163,25 @@ def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta,
         annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, config)
 
         check_input_names(pangenome, genome_name_to_path)
-        organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path,  tmpdir=tmpdir, cpu=cpu,
-                            translation_table=int(pangenome_params.cluster.translation_table), norna=annotate_params.norna, kingdom=annotate_params.kingdom,
-                            allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=disable_bar)
+        organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path, tmpdir=tmpdir, cpu=cpu,
+                                         translation_table=int(pangenome_params.cluster.translation_table),
+                                         norna=annotate_params.norna, kingdom=annotate_params.kingdom,
+                                         allow_overlap=annotate_params.allow_overlap,
+                                         procedure=annotate_params.prodigal_procedure, disable_bar=disable_bar)
     return organisms, genome_name_to_path, input_type
 
 
-def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input_org_2_rgps:Dict[Organism, Set[Region]],
-                            input_org_to_spots:Dict[Organism, Set[Spot]],
-                            input_orgs_to_modules:Dict[Organism, Set[Module]] ,
-                            input_org_to_lonely_genes_count:Dict[Organism, int],
-                            write_proksee:bool, write_gff:bool, write_table:bool,
-                            add_sequences:bool,
-                            genome_name_to_path:Dict[str,dict], input_type:str,
-                            output_dir:Path, dup_margin:float, soft_core:float,
-                            metadata_sep:str, compress:bool, 
-                            need_regions:bool, need_spots:bool, need_modules:bool):
+def write_projection_results(pangenome: Pangenome, organisms: Set[Organism],
+                             input_org_2_rgps: Dict[Organism, Set[Region]],
+                             input_org_to_spots: Dict[Organism, Set[Spot]],
+                             input_orgs_to_modules: Dict[Organism, Set[Module]],
+                             input_org_to_lonely_genes_count: Dict[Organism, int],
+                             write_proksee: bool, write_gff: bool, write_table: bool,
+                             add_sequences: bool,
+                             genome_name_to_path: Dict[str, dict], input_type: str,
+                             output_dir: Path, dup_margin: float, soft_core: float,
+                             metadata_sep: str, compress: bool,
+                             need_regions: bool, need_spots: bool, need_modules: bool):
     """
     Write the results of the projection of pangneome onto input genomes.
 
@@ -214,14 +215,12 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input
     # single_copy_families = get_single_copy_families(pangenome, dup_margin)
     # multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin)
 
-
-
-    # dup margin value here is specified in argument and is used to compute completeness. 
+    # dup margin value here is specified in argument and is used to compute completeness.
     # Thats mean it can be different than dup margin used in spot and RGPS.
 
-    pangenome_persistent_single_copy_families =  pangenome.get_single_copy_persistent_families(dup_margin = dup_margin, exclude_fragments=True)
+    pangenome_persistent_single_copy_families = pangenome.get_single_copy_persistent_families(dup_margin=dup_margin,
+                                                                                              exclude_fragments=True)
     pangenome_persistent_count = len([fam for fam in pangenome.gene_families if fam.named_partition == "persistent"])
-    
 
     soft_core_families = pangenome.soft_core_families(soft_core)
     exact_core_families = pangenome.exact_core_families()
@@ -234,17 +233,17 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input
         singleton_gene_count = input_org_to_lonely_genes_count[organism]
 
         org_summary = summarize_projected_genome(organism,
-                                                pangenome_persistent_count,
-                                                pangenome_persistent_single_copy_families,
-                                                soft_core_families=soft_core_families,
-                                                exact_core_families=exact_core_families,
-                                                input_org_rgps=input_org_2_rgps.get(organism, None), 
-                                                input_org_spots=input_org_to_spots.get(organism, None),
-                                                input_org_modules=input_orgs_to_modules.get(organism, None),
-                                                pangenome_file=pangenome.file,
-                                                singleton_gene_count=singleton_gene_count)
+                                                 pangenome_persistent_count,
+                                                 pangenome_persistent_single_copy_families,
+                                                 soft_core_families=soft_core_families,
+                                                 exact_core_families=exact_core_families,
+                                                 input_org_rgps=input_org_2_rgps.get(organism, None),
+                                                 input_org_spots=input_org_to_spots.get(organism, None),
+                                                 input_org_modules=input_orgs_to_modules.get(organism, None),
+                                                 pangenome_file=pangenome.file,
+                                                 singleton_gene_count=singleton_gene_count)
         summaries.append(org_summary)
-        
+
         yaml_outputfile = output_dir / organism.name / "projection_summary.yaml"
         write_summary_in_yaml(org_summary, yaml_outputfile)
 
@@ -264,7 +263,7 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input
                                    genome_sequences=genome_sequences, compress=compress)
 
         if write_gff:
-            if input_type == "annotation": # if the genome has not been annotated by PPanGGOLiN
+            if input_type == "annotation":  # if the genome has not been annotated by PPanGGOLiN
                 annotation_sources = {"rRNA": "external",
                                       "tRNA": "external",
                                       "CDS": "external"}
@@ -278,14 +277,13 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input
 
         if write_table:
             write_tsv_genome_file(organism, output_dir / organism.name, compress=compress, metadata_sep=metadata_sep,
-                   need_regions=need_regions, need_spots=need_spots, need_modules=need_modules)
+                                  need_regions=need_regions, need_spots=need_spots, need_modules=need_modules)
 
     output_file = output_dir / "summary_projection.tsv"
     write_summaries_in_tsv(summaries,
                            output_file=output_file,
                            dup_margin=dup_margin,
                            soft_core=soft_core)
-    
 
 
 def summarize_projected_genome(organism: Organism,
@@ -494,7 +492,7 @@ def check_input_names(pangenome, input_names):
             f"{len(duplicated_names)} provided genome name(s) already exist in the given pangenome: {' '.join(duplicated_names)}")
 
 
-def write_summary_in_yaml(summary_info:  Dict[str, Any], output_file: Path):
+def write_summary_in_yaml(summary_info: Dict[str, Any], output_file: Path):
     """
     Write summary information to a YAML file.
 
@@ -540,23 +538,25 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org
 
     logging.getLogger('PPanGGOLiN').info('Writing gene sequences of input genomes.')
 
-    for input_organism in input_organisms:
-        seq_outdir = output / input_organism.name
-        mk_outdir(seq_outdir, force=True)
-
-        seq_fasta_file = seq_outdir / "cds_sequences.fasta"
-
-        with open(seq_fasta_file, "w") as fh_out_faa:
-            write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True,
-                                                  add="ppanggolin_")
+    input_genes = [gene for org in input_organisms for gene in org.genes]
 
-        seq_fasta_files.append(seq_fasta_file)
+    # for input_organism in input_organisms:
+    #     seq_outdir = output / input_organism.name
+    #     mk_outdir(seq_outdir, force=True)
+    #
+    #     seq_fasta_file = seq_outdir / "cds_sequences.fasta"
+    #
+    #     write_gene_sequences_from_annotations(input_organism.genes, seq_fasta_file, disable_bar=True, add="ppanggolin_")
+    #
+    #     seq_fasta_files.append(seq_fasta_file)
+    seq_fasta_file = output / 'input_genes.fasta'
 
-    with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
+    write_gene_sequences_from_annotations(input_genes, seq_fasta_file, disable_bar=True, add='ppanggolin_')
 
+    with create_tmpdir(main_dir=tmpdir, basename="projection_tmp", keep_tmp=keep_tmp) as new_tmpdir:
         if use_representatives:
             _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome=pangenome,
-                                                                       sequence_files=seq_fasta_files,
+                                                                       sequence_files=seq_fasta_file,
                                                                        output=new_tmpdir, tmpdir=new_tmpdir,
                                                                        is_input_seq_nt=True, is_input_slf=True,
                                                                        cpu=cpu, no_defrag=no_defrag,
@@ -564,16 +564,18 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org
                                                                        translation_table=translation_table)
         else:
             _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome,
-                                                                       sequence_files=seq_fasta_files,
+                                                                       sequence_files=seq_fasta_file,
                                                                        output=new_tmpdir, tmpdir=new_tmpdir,
                                                                        is_input_seq_nt=True, is_input_slf=True,
                                                                        cpu=cpu, no_defrag=no_defrag, identity=identity,
                                                                        coverage=coverage,
                                                                        translation_table=translation_table,
                                                                        disable_bar=disable_bar)
+
     input_org_to_lonely_genes_count = {}
     for input_organism in input_organisms:
         org_outdir = output / input_organism.name
+        mk_outdir(org_outdir, force=True)
 
         seq_set = {gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in input_organism.genes}
 
@@ -606,7 +608,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org
                     gene_family = GeneFamily(pangenome.max_fam_id, new_name)
 
                 pangenome.add_gene_family(gene_family)
-                
+
                 gene_family.partition = "Cloud"
                 lonely_genes.add(gene)
 
@@ -616,7 +618,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org
                     "The input genome contains a gene that aligns to a pangenome family "
                     f"which already contains a gene with the same ID ({gene_id}). "
                     f"The genome name has been appended to the family name: {new_name}")
-                
+
                 gene.ID = new_name
 
             # Add the gene to the gene family
@@ -624,7 +626,6 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org
 
         pangenome._mk_gene_getter()  # re-build the gene getter
 
-
         logging.getLogger('PPanGGOLiN').info(
             f"{input_organism.name} has {len(lonely_genes)}/{input_organism.number_of_genes()} "
             "specific genes that do not align to any gene of the pangenome.")
@@ -923,7 +924,7 @@ def predict_spot_in_one_organism(
         overlapping_match: int = 2,
         set_size: int = 3,
         exact_match: int = 1,
-        compress:bool = False) -> Set[Spot]:
+        compress: bool = False) -> Set[Spot]:
     """
     Predict spots for input organism RGPs.
 
@@ -960,10 +961,11 @@ def predict_spot_in_one_organism(
             input_org_node_to_rgps[border_node].add(rgp)
 
     if len(input_org_node_to_rgps) == 0:
-        logging.getLogger("PPanGGOLiN").debug(f"{organism_name}: no RGPs of the input genome will be associated with any spot of insertion "
-                                             "as they are on a contig border (or have "
-                                             f"less than {set_size} persistent gene families until the contig border). "
-                                             "Projection of spots stops here")
+        logging.getLogger("PPanGGOLiN").debug(
+            f"{organism_name}: no RGPs of the input genome will be associated with any spot of insertion "
+            "as they are on a contig border (or have "
+            f"less than {set_size} persistent gene families until the contig border). "
+            "Projection of spots stops here")
         return set()
 
     # remove node that were already in the graph
@@ -1048,7 +1050,7 @@ def predict_spot_in_one_organism(
                             filename='input_genome_rgp_to_spot.tsv', compress=compress)
 
     input_org_spots = {spot for spots in input_rgp_to_spots.values()
-                 for spot in spots }
+                       for spot in spots}
     new_spots = {spot for spot in input_org_spots if isinstance(spot, NewSpot)}
 
     logging.getLogger('PPanGGOLiN').debug(
@@ -1189,8 +1191,6 @@ def check_projection_arguments(args: argparse.Namespace, parser: argparse.Argume
     return input_mode
 
 
-
-
 def launch(args: argparse.Namespace):
     """
     Command launcher
@@ -1208,41 +1208,43 @@ def launch(args: argparse.Namespace):
 
     predict_rgp, project_spots, project_modules = check_pangenome_for_projection(pangenome, args.fast)
 
-    need_graph =True if args.table else False
+    need_graph = True if args.table else False
 
     check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar,
                          need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False,
                          need_spots=project_spots, need_graph=need_graph)
-    
+
     logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.')
     pangenome_params = argparse.Namespace(
         **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()})
 
     if predict_rgp:
-        #computing multigenics for rgp prediction first to have original family.number_of_genomes 
+        # computing multigenics for rgp prediction first to have original family.number_of_genomes
         # and the same multigenics list as when rgp and spot were predicted
         multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin)
-    
-    organisms, genome_name_to_path, input_type = manage_input_genomes_annotation(pangenome=pangenome, 
-                                                                    input_mode=args.input_mode, 
-                                                                    anno=args.anno, fasta=args.fasta,
-                                                                    organism_name=args.genome_name, 
-                                                                    circular_contigs=args.circular_contigs, 
-                                                                    pangenome_params=pangenome_params,
-                                                                    cpu=args.cpu, use_pseudo=args.use_pseudo,
-                                                                    disable_bar=args.disable_prog_bar, 
-                                                                    tmpdir= args.tmpdir, config=args.config) 
-
-  
+
+    organisms, genome_name_to_path, input_type = manage_input_genomes_annotation(pangenome=pangenome,
+                                                                                 input_mode=args.input_mode,
+                                                                                 anno=args.anno, fasta=args.fasta,
+                                                                                 organism_name=args.genome_name,
+                                                                                 circular_contigs=args.circular_contigs,
+                                                                                 pangenome_params=pangenome_params,
+                                                                                 cpu=args.cpu,
+                                                                                 use_pseudo=args.use_pseudo,
+                                                                                 disable_bar=args.disable_prog_bar,
+                                                                                 tmpdir=args.tmpdir, config=args.config)
 
     input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms,
-                                                                                output=output_dir, cpu=args.cpu, use_representatives=args.fast,
-                                                                                no_defrag=args.no_defrag, identity=args.identity,
-                                                                                coverage=args.coverage, tmpdir=args.tmpdir,
-                                                                                translation_table=int(pangenome_params.cluster.translation_table),
-                                                                                keep_tmp=args.keep_tmp, 
-                                                                                disable_bar=args.disable_prog_bar)
-            
+                                                                                   output=output_dir, cpu=args.cpu,
+                                                                                   use_representatives=args.fast,
+                                                                                   no_defrag=args.no_defrag,
+                                                                                   identity=args.identity,
+                                                                                   coverage=args.coverage,
+                                                                                   tmpdir=args.tmpdir,
+                                                                                   translation_table=int(
+                                                                                       pangenome_params.cluster.translation_table),
+                                                                                   keep_tmp=args.keep_tmp,
+                                                                                   disable_bar=args.disable_prog_bar)
 
     input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = {}, {}, {}
 
@@ -1250,38 +1252,41 @@ def launch(args: argparse.Namespace):
 
         logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.')
 
-        input_org_2_rgps = predict_RGP(pangenome, organisms,  persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain,
-                                     min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, output_dir=output_dir,
-                                     disable_bar=args.disable_prog_bar, compress=args.compress)
+        input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty,
+                                       variable_gain=pangenome_params.rgp.variable_gain,
+                                       min_length=pangenome_params.rgp.min_length,
+                                       min_score=pangenome_params.rgp.min_score, multigenics=multigenics,
+                                       output_dir=output_dir,
+                                       disable_bar=args.disable_prog_bar, compress=args.compress)
 
         if project_spots:
             logging.getLogger('PPanGGOLiN').info('Predicting spot of insertion in input genomes.')
             input_org_to_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots),
-                                                            initial_regions=pangenome.regions,
-                                                            input_org_2_rgps=input_org_2_rgps,
-                                                            multigenics=multigenics,
-                                                            output=output_dir,
-                                                            write_graph_flag=args.spot_graph,
-                                                            graph_formats=args.graph_formats,
-                                                            overlapping_match=pangenome_params.spot.overlapping_match,
-                                                            set_size=pangenome_params.spot.set_size,
-                                                            exact_match=pangenome_params.spot.exact_match_size,
-                                                            compress=args.compress)
+                                                                  initial_regions=pangenome.regions,
+                                                                  input_org_2_rgps=input_org_2_rgps,
+                                                                  multigenics=multigenics,
+                                                                  output=output_dir,
+                                                                  write_graph_flag=args.spot_graph,
+                                                                  graph_formats=args.graph_formats,
+                                                                  overlapping_match=pangenome_params.spot.overlapping_match,
+                                                                  set_size=pangenome_params.spot.set_size,
+                                                                  exact_match=pangenome_params.spot.exact_match_size,
+                                                                  compress=args.compress)
 
     if project_modules:
         input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir, compress=args.compress)
 
     write_projection_results(pangenome, organisms, input_org_2_rgps,
-                            input_org_to_spots,
-                            input_orgs_to_modules,
-                            input_org_to_lonely_genes_count,
-                            write_proksee=args.proksee, write_gff=args.gff, write_table=args.table,
-                            add_sequences=args.add_sequences,
-                            genome_name_to_path=genome_name_to_path, input_type=input_type,
-                            output_dir=output_dir, dup_margin=args.dup_margin, soft_core=args.soft_core,
-                            metadata_sep=args.metadata_sep, compress=args.compress, 
-                            need_modules=project_modules, need_spots=project_spots, need_regions=predict_rgp)
-    
+                             input_org_to_spots,
+                             input_orgs_to_modules,
+                             input_org_to_lonely_genes_count,
+                             write_proksee=args.proksee, write_gff=args.gff, write_table=args.table,
+                             add_sequences=args.add_sequences,
+                             genome_name_to_path=genome_name_to_path, input_type=input_type,
+                             output_dir=output_dir, dup_margin=args.dup_margin, soft_core=args.soft_core,
+                             metadata_sep=args.metadata_sep, compress=args.compress,
+                             need_modules=project_modules, need_spots=project_spots, need_regions=predict_rgp)
+
 
 def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
     """
@@ -1355,12 +1360,12 @@ def parser_projection(parser: argparse.ArgumentParser):
                           help="minimum ratio of genomes in which the family must have multiple genes "
                                "for it to be considered 'duplicated'. "
                                "This metric is used to compute completeness and duplication of the input genomes")
-    
+
     optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95,
-                      help="Soft core threshold used when generating general statistics on the projected genome. "
-                           "This threshold does not influence PPanGGOLiN's partitioning. "
-                           "The value determines the minimum fraction of genomes that must possess a gene family "
-                           "for it to be considered part of the soft core.")
+                          help="Soft core threshold used when generating general statistics on the projected genome. "
+                               "This threshold does not influence PPanGGOLiN's partitioning. "
+                               "The value determines the minimum fraction of genomes that must possess a gene family "
+                               "for it to be considered part of the soft core.")
 
     optional.add_argument("--spot_graph", required=False, action="store_true",
                           help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs "
@@ -1374,13 +1379,13 @@ def parser_projection(parser: argparse.ArgumentParser):
 
     optional.add_argument("--proksee", required=False, action="store_true",
                           help="Generate JSON map files for PROKSEE with projected pangenome annotations for each input genome.")
-    
+
     optional.add_argument("--table", required=False, action="store_true",
                           help="Generate a tsv file for each input genome with pangenome annotations.")
-    
+
     optional.add_argument("--compress", required=False, action="store_true",
                           help="Compress the files in .gz")
-    
+
     optional.add_argument("--add_sequences", required=False, action="store_true",
                           help="Include input genome DNA sequences in GFF and Proksee output.")
 
@@ -1392,21 +1397,21 @@ def parser_projection(parser: argparse.ArgumentParser):
 
     optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
                           help="Keeping temporary files (useful for debugging).")
-    
+
     optional.add_argument("--add_metadata",
-                            required=False,
-                            action="store_true",
-                            help="Include metadata information in the output files "
-                                "if any have been added to pangenome elements (see ppanggolin metadata command).")
+                          required=False,
+                          action="store_true",
+                          help="Include metadata information in the output files "
+                               "if any have been added to pangenome elements (see ppanggolin metadata command).")
 
     optional.add_argument("--metadata_sources",
-                            default=None,
-                            nargs="+",
-                            help="Which source of metadata should be written. "
-                                "By default all metadata sources are included.")
+                          default=None,
+                          nargs="+",
+                          help="Which source of metadata should be written. "
+                               "By default all metadata sources are included.")
 
     optional.add_argument("--metadata_sep",
-                            required=False,
-                            default='|',
-                            help="The separator used to join multiple metadata values for elements with multiple metadata"
-                                " values from the same source. This character should not appear in metadata values.")
+                          required=False,
+                          default='|',
+                          help="The separator used to join multiple metadata values for elements with multiple metadata"
+                               " values from the same source. This character should not appear in metadata values.")

From 1516f124eaa929454728c99e29c645dc9e4d7823 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 7 Jun 2024 15:28:31 +0200
Subject: [PATCH 21/25] Improve GitHub workflow

---
 .github/workflows/main.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 2abe5b7f..88088f66 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -95,12 +95,12 @@ jobs:
         ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06  --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --regions --borders --families_tsv --cpu 1 
         ppanggolin write_genomes  -p stepbystep/pangenome.h5 --output stepbystep -f --fasta genomes.fasta.list --gff --proksee --table
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta genomes.fasta.list
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families rgp --gene_families rgp 
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families rgp --gene_families rgp --compress 
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families softcore --gene_families softcore 
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --cpu $NUM_CPUS --keep_tmp 
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes core --proteins cloud
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 --compress
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --cpu $NUM_CPUS --keep_tmp --compress
 
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log
@@ -180,6 +180,8 @@ jobs:
         head genomes.gbff.list | sed 's/^/input_genome_/g' > genomes.gbff.head.list
         ppanggolin projection --pangenome stepbystep/pangenome.h5  -o projection_from_list_of_gbff --anno genomes.gbff.head.list --gff --proksee --cpu $NUM_CPUS
 
+        head genomes.fasta.list | sed 's/^/input_genome_/g' > genomes.fasta.head.list
+        ppanggolin projection --pangenome myannopang/pangenome.h5  -o projection_from_list_of_fasta --fasta genomes.fasta.head.list --gff --proksee --cpu $NUM_CPUS
 
         ppanggolin projection --pangenome mybasicpangenome/pangenome.h5  -o projection_from_single_fasta \
                               --genome_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \

From e909a6525778464562a5cecbec46c3c71124c326 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Mon, 10 Jun 2024 16:14:46 +0200
Subject: [PATCH 22/25] simplify unit test

---
 pyproject.toml            |   6 --
 tests/align/test_align.py | 159 ++++++++++++++++----------------------
 2 files changed, 65 insertions(+), 100 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index abdbf2d7..7864f533 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -81,9 +81,3 @@ packages = ["ppanggolin"]
 
 [tool.setuptools.dynamic]
 version = {file = "VERSION"}
-
-[tool.pytest.ini_options]
-markers = [
-    "nucleotides: marks tests to run with nucleotides alphabets",
-    "aminoacids: marks tests to run with amino-acids alphabets",
-]
\ No newline at end of file
diff --git a/tests/align/test_align.py b/tests/align/test_align.py
index e5ce2dde..42d1cd5f 100644
--- a/tests/align/test_align.py
+++ b/tests/align/test_align.py
@@ -6,106 +6,77 @@
 
 
 @pytest.fixture
-def nucleotides():
-    yield ['A', 'C', 'G', 'T', 'N']
+def single_line_fasta_nt() -> List:
+            
+    return ['>Gene_1 seq_description\n',
+            'ATGCGTTGTCGTTG\n',
+            ">Gene_2\n",
+            "TGTGACCTGCT\n"
+            ]
 
 @pytest.fixture
-def aminoacids():
-    amino_acid_alphabet = [
-        'A',  # Alanine
-        'R',  # Arginine
-        'N',  # Asparagine
-        'D',  # Aspartic acid
-        'C',  # Cysteine
-        'E',  # Glutamic acid
-        'Q',  # Glutamine
-        'G',  # Glycine
-        'H',  # Histidine
-        'I',  # Isoleucine
-        'L',  # Leucine
-        'K',  # Lysine
-        'M',  # Methionine
-        'F',  # Phenylalanine
-        'P',  # Proline
-        'S',  # Serine
-        'T',  # Threonine
-        'W',  # Tryptophan
-        'Y',  # Tyrosine
-        'V'  # Valine
-    ]
-    yield amino_acid_alphabet
+def single_line_fasta_aa() -> List:
+            
+    return ['>Gene_1 seq_description\n',
+            'YWTPRPFFYAAEYNN\n',
+            ">Gene_2\n",
+            "YWTPRPSYWTPAAEYNN\n"
+            ]
 
 @pytest.fixture
-def number_of_sequences():
-    yield randint(4, 10)
-@pytest.fixture
-def single_line_fasta(request, tmp_path_factory: pytest.TempPathFactory, number_of_sequences,
-                      aminoacids: List[str], nucleotides: List[str]):
-    if request.node.get_closest_marker('aminoacids'):
-        alphabet = aminoacids
-        fasta_path = tmp_path_factory.getbasetemp() / "single_line_nt.fasta"
-    else:
-        alphabet = nucleotides
-        fasta_path = tmp_path_factory.getbasetemp() / "single_line_aa.fasta"
-    with open(fasta_path, "w") as fasta:
-        for i in range(number_of_sequences):
-            fasta.write(f">Gene_{i}\n")
-            fasta.write("".join([choice(alphabet) for _ in range(0, randint(30, 100))]))
-            fasta.write("\n")
-    yield fasta_path
+def multi_line_fasta_nt() -> List:
+            
+    return ['>Gene_1 seq_description\n',
+            'ATGCGT\n',
+            'TGTCGTTG\n',
+            ">Gene_2\n",
+            "TGTGACCTGCT\n"
+            ]
 
 @pytest.fixture
-def multi_line_fasta(request, tmp_path_factory: pytest.TempPathFactory, number_of_sequences,
-                     aminoacids: List[str], nucleotides: List[str]):
-    if request.node.get_closest_marker('aminoacids'):
-        alphabet = aminoacids
-        fasta_path = tmp_path_factory.getbasetemp() / "single_line_nt.fasta"
-    else:
-        alphabet = nucleotides
-        fasta_path = tmp_path_factory.getbasetemp() / "single_line_aa.fasta"
-    with open(fasta_path, "w") as fasta:
-        for i in range(number_of_sequences):
-            fasta.write(f">Gene_{i}\n")
-            for j in range(randint(4,10)):
-                fasta.write("".join([choice(alphabet) for _ in range(60)]))
-                fasta.write("\n")
-    yield fasta_path
-
-@pytest.mark.nucleotides
-def test_get_seq_ids_single_line_nt(number_of_sequences, single_line_fasta):
-    with open(single_line_fasta, "r") as fasta:
-        seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta)
-        assert len(seq_set) == number_of_sequences
-        assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)}
-        assert is_nucleotide
-        assert single_line_fasta
-
-@pytest.mark.aminoacids
-def test_get_seq_ids_single_line_aa(number_of_sequences, single_line_fasta):
-    with open(single_line_fasta, "r") as fasta:
-        seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta)
-        assert len(seq_set) == number_of_sequences
-        assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)}
-        assert not is_nucleotide
-        assert single_line_fasta
-
-@pytest.mark.nucleotides
-def test_get_seq_ids_multi_line_nt(number_of_sequences, multi_line_fasta):
-    with open(multi_line_fasta, "r") as fasta:
-        seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta)
-        assert len(seq_set) == number_of_sequences
-        assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)}
-        assert is_nucleotide
-        assert not single_line_fasta
-
-@pytest.mark.aminoacids
-def test_get_seq_ids_multi_line_aa(number_of_sequences, multi_line_fasta):
-    with open(multi_line_fasta, "r") as fasta:
-        seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta)
-        assert len(seq_set) == number_of_sequences
-        assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)}
-        assert not is_nucleotide
-        assert not single_line_fasta
+def multi_line_fasta_aa() -> List:
+            
+    return ['>Gene_1 seq_description\n',
+            'AAEYNN\n',
+            'YWTPRPFFY\n',
+            ">Gene_2\n",
+            "YWTPRPS\n",
+            "YWTPAAEYNN\n"
+            ]
+
+
+def test_get_seq_ids_single_line_nt(single_line_fasta_nt):
+    
+    
+    seq_set, is_nucleotide, single_line_fasta = get_seq_ids(single_line_fasta_nt)
+    assert len(seq_set) == 2
+    assert seq_set == {'Gene_1', 'Gene_2'}
+    assert is_nucleotide
+    assert single_line_fasta
+
+def test_get_seq_ids_single_line_aa(single_line_fasta_aa):
+    seq_set, is_nucleotide, single_line_fasta = get_seq_ids(single_line_fasta_aa)
+    assert len(seq_set) == 2
+    assert seq_set == {'Gene_1', 'Gene_2'}
+    assert not is_nucleotide
+    assert single_line_fasta
+
+def test_get_seq_ids_multi_line_nt(multi_line_fasta_nt):
+
+    seq_set, is_nucleotide, single_line_fasta = get_seq_ids(multi_line_fasta_nt)
+
+    assert len(seq_set) == 2
+    assert seq_set == {'Gene_1', 'Gene_2'}
+    assert is_nucleotide
+    assert not single_line_fasta
+
+def test_get_seq_ids_multi_line_aa(multi_line_fasta_aa):
+
+    seq_set, is_nucleotide, single_line_fasta = get_seq_ids(multi_line_fasta_aa)
+    assert len(seq_set) == 2
+    assert seq_set == {'Gene_1', 'Gene_2'}
+    assert not is_nucleotide
+    assert not single_line_fasta
 
 
 

From 0e69f70f7dc79c4a47e080b4337bbd9a75e90152 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Mon, 10 Jun 2024 16:39:48 +0200
Subject: [PATCH 23/25] Solve requested change

---
 ppanggolin/cluster/cluster.py        | 1 -
 ppanggolin/formats/writeSequences.py | 2 +-
 tests/pytest.ini                     | 4 ++++
 3 files changed, 5 insertions(+), 2 deletions(-)
 create mode 100644 tests/pytest.ini

diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index faab8873..e11793ec 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -501,7 +501,6 @@ def parser_clust(parser: argparse.ArgumentParser):
                             "with their original gene family.")
     clust.add_argument("--translation_table", required=False, default="11",
                        help="Translation table (genetic code) to use.")
-    # clust.add_argument("--compress")
 
     clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
 
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 272e1a8a..5f188f99 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -501,7 +501,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa
     for line in read_compressed_or_not(organisms_file):
         elements = [el.strip() for el in line.split("\t")]
         if len(elements) <= 1:
-            raise SyntaxError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'")
+            raise ValueError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'")
         org_dict[elements[0]] = Path(elements[1])
         if not org_dict[elements[0]].exists():  # Check tsv sanity test if it's not one it's the other
             org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]])
diff --git a/tests/pytest.ini b/tests/pytest.ini
new file mode 100644
index 00000000..393e1386
--- /dev/null
+++ b/tests/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+markers =
+	nucleotides: marks tests to run with nucleotides alphabets
+	aminoacids: marks tests to run with amino-acids alphabets
\ No newline at end of file

From 93a4b01543d456c0f8ab8883ece6a928173adeb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Mon, 10 Jun 2024 17:30:41 +0200
Subject: [PATCH 24/25] Make alignment with MMSeqs2 more flexible.

---
 ppanggolin/align/alignOnPang.py         | 46 +++++++++++++------------
 ppanggolin/context/searchGeneContext.py | 11 +++---
 ppanggolin/projection/projection.py     |  9 +++--
 3 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
index 2b5547d3..ca72ed9b 100644
--- a/ppanggolin/align/alignOnPang.py
+++ b/ppanggolin/align/alignOnPang.py
@@ -27,7 +27,7 @@
 
 def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]],
                       tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
-                      is_query_nt: bool = False, is_query_slf: bool = False, is_target_nt: bool = False,
+                      query_type: str = "unknow", is_query_slf: bool = False, target_type: str = "unknow",
                       is_target_slf: bool = False, translation_table: int = None) -> Path:
     """
     Align fasta sequence to pangenome sequences. 
@@ -39,34 +39,36 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi
     :param no_defrag: Do not apply defragmentation
     :param identity: minimal identity threshold for the alignment
     :param coverage: minimal identity threshold for the alignment
-    :param is_query_nt: Is the sequence file (query) are nucleotide sequences. If True, sequences are translated by mmseqs
+    :param query_type: Sequences type of the file (query). [nucleotide, protein, unknow]
     :param is_query_slf: Is the sequence file (query) with single line fasta. If True, MMSeqs2 database will be with soft link
-    :param is_target_nt: Is the sequences of pangenome (target) are nucleotide sequences. If True, sequences are translated by mmseqs
+    :param target_type: Sequences type of pangenome (target). [nucleotide, aminoacid, protein]
     :param is_target_slf: Is the sequences of pangenome (target) with single line fasta. If True, MMSeqs2 database will be with soft link
     :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated.
 
     :return: Alignment result file
     """
 
-    if is_target_nt:
+    if target_type == "nucleotide":
         logging.getLogger("PPanGGOLiN").debug("Target sequences will be translated by mmseqs with "
                                               f"translation table {translation_table}")
         with create_tmpdir(tmpdir, basename="target_db", keep_tmp=True) as target_db_dir:
             #  Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same
             target_db = translate_genes(target_seq_file, target_db_dir, cpu, is_target_slf, translation_table)
     else:
+        db_type = 1 if target_type == "protein" else 0
         target_db = create_mmseqs_db([target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file,
-                                     'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=1)
+                                     'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=db_type)
 
-    if is_query_nt:
+    if query_type == "nucleotide":
         logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs "
                                               f"with translation table {translation_table}")
         with create_tmpdir(tmpdir, basename="query_db", keep_tmp=True) as query_db_dir:
             #  Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same
             query_db = translate_genes(query_seq_files, query_db_dir, cpu, is_query_slf, translation_table)
     else:
+        db_type = 1 if query_type == "protein" else 0
         query_db = create_mmseqs_db([query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files,
-                                    'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=1)
+                                    'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=db_type)
 
     cov_mode = "2"  # coverage of query
     if no_defrag:
@@ -393,7 +395,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel
 
 
 def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path,
-                                     tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1,
+                                     tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1,
                                      no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
                                      translation_table: int = 11, disable_bar: bool = False
                                      ) -> Tuple[Path, Dict[str, GeneFamily]]:
@@ -407,7 +409,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union
     :param sequence_files: Iterable of paths of FASTA files containing input sequences to align.
     :param output: Path to the output directory where alignment results will be stored.
     :param tmpdir: Temporary directory for intermediate files.
-    :param is_input_seq_nt: Is input sequence file nucleotide sequences.
+    :param input_type: Type of input sequence file. [nucleotide, aminoacid, unknow]
     :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link
     :param cpu: Number of CPU cores to use for the alignment (default: 1).
     :param no_defrag: If True, the defragmentation workflow is skipped (default: False).
@@ -424,10 +426,10 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union
     logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {pangenome_sequences.absolute()}')
     write_gene_fam_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar)
 
-    align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files,
-                                   tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
-                                   is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf,
-                                   is_target_nt=False, is_target_slf=True, translation_table=translation_table)
+    align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir,
+                                   cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
+                                   query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True,
+                                   target_type="protein", translation_table=translation_table)
 
     seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome)
 
@@ -435,7 +437,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union
 
 
 def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files:  Union[Path, Iterable[Path]], output: Path,
-                                     tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1,
+                                     tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1,
                                      no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
                                      translation_table: int = 11, disable_bar: bool = False
                                      ) -> Tuple[Path, Dict[str, GeneFamily]]:
@@ -449,7 +451,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files:  Unio
     :param sequence_files: Iterable of paths of FASTA files containing input sequences to align.
     :param output: Path to the output directory where alignment results will be stored.
     :param tmpdir: Temporary directory for intermediate files.
-    :param is_input_seq_nt: Is input sequence file nucleotide sequences.
+    :param input_type: Sequences type of the file (query). [nucleotide, protein, unknow]
     :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link
     :param cpu: Number of CPU cores to use for the alignment (default: 1).
     :param no_defrag: If True, the defragmentation workflow is skipped (default: False).
@@ -467,8 +469,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files:  Unio
 
     align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir,
                                    cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
-                                   is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True,
-                                   is_target_slf=True, translation_table=translation_table)
+                                   query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True,
+                                   target_type="nucleotide", translation_table=translation_table)
 
     seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome)
 
@@ -519,19 +521,19 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo
         seq_set, is_nucleotide, single_line_fasta = get_seq_ids(seqFileObj)
 
     with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
-
+        input_type = "nucleotide" if is_nucleotide else "unknow"
         if use_representatives:
             align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir,
-                                                                    is_input_seq_nt=is_nucleotide,
-                                                                    is_input_slf=single_line_fasta,
-                                                                    cpu=cpu, no_defrag=no_defrag, identity=identity,
+                                                                    input_type=input_type,
+                                                                    is_input_slf=single_line_fasta, cpu=cpu,
+                                                                    no_defrag=no_defrag, identity=identity,
                                                                     coverage=coverage,
                                                                     translation_table=translation_table,
                                                                     disable_bar=disable_bar)
         else:
             align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file,
                                                                     output=output, tmpdir=new_tmpdir,
-                                                                    is_input_seq_nt=is_nucleotide,
+                                                                    input_type=input_type,
                                                                     is_input_slf=single_line_fasta,
                                                                     cpu=cpu, no_defrag=no_defrag, identity=identity,
                                                                     coverage=coverage,
diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py
index 508203fc..512d12a6 100644
--- a/ppanggolin/context/searchGeneContext.py
+++ b/ppanggolin/context/searchGeneContext.py
@@ -74,18 +74,17 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir:
         logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences")
 
         with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
-
+            input_type = "nucleotide" if is_nucleotide else "unknow"
             if use_representatives:
                 _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir,
-                                                                is_input_seq_nt=is_nucleotide, is_input_slf=is_slf,
-                                                                cpu=cpu, no_defrag=no_defrag,
-                                                                identity=identity, coverage=coverage,
-                                                                translation_table=translation_table,
+                                                                input_type=input_type, is_input_slf=is_slf, cpu=cpu,
+                                                                no_defrag=no_defrag, identity=identity,
+                                                                coverage=coverage, translation_table=translation_table,
                                                                 disable_bar=disable_bar)
             else:
                 _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file,
                                                                 output=output, tmpdir=new_tmpdir,
-                                                                is_input_seq_nt=is_nucleotide, is_input_slf=is_slf,
+                                                                input_type=input_type, is_input_slf=is_slf,
                                                                 cpu=cpu, no_defrag=no_defrag,
                                                                 identity=identity, coverage=coverage,
                                                                 translation_table=translation_table,
diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py
index 08c0da5f..78cfc0d3 100644
--- a/ppanggolin/projection/projection.py
+++ b/ppanggolin/projection/projection.py
@@ -558,17 +558,16 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org
     with create_tmpdir(main_dir=tmpdir, basename="projection_tmp", keep_tmp=keep_tmp) as new_tmpdir:
         if use_representatives:
             _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome=pangenome,
-                                                                       sequence_files=seq_fasta_file,
-                                                                       output=new_tmpdir, tmpdir=new_tmpdir,
-                                                                       is_input_seq_nt=True, is_input_slf=True,
-                                                                       cpu=cpu, no_defrag=no_defrag,
+                                                                       sequence_files=seq_fasta_file, output=new_tmpdir,
+                                                                       tmpdir=new_tmpdir, input_type="nucleotide",
+                                                                       is_input_slf=True, cpu=cpu, no_defrag=no_defrag,
                                                                        identity=identity, coverage=coverage,
                                                                        translation_table=translation_table)
         else:
             _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome,
                                                                        sequence_files=seq_fasta_file,
                                                                        output=new_tmpdir, tmpdir=new_tmpdir,
-                                                                       is_input_seq_nt=True, is_input_slf=True,
+                                                                       input_type="nucleotide", is_input_slf=True,
                                                                        cpu=cpu, no_defrag=no_defrag, identity=identity,
                                                                        coverage=coverage,
                                                                        translation_table=translation_table,

From f30ab344592a070116dd842912e9239993e9a53e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Mon, 10 Jun 2024 17:31:31 +0200
Subject: [PATCH 25/25] clean old code

---
 ppanggolin/projection/projection.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py
index 78cfc0d3..8a734201 100644
--- a/ppanggolin/projection/projection.py
+++ b/ppanggolin/projection/projection.py
@@ -536,21 +536,10 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org
 
     :return: Number of genes that do not cluster with any of the gene families of the pangenome.
     """
-    seq_fasta_files = []
-
     logging.getLogger('PPanGGOLiN').info('Writing gene sequences of input genomes.')
 
     input_genes = [gene for org in input_organisms for gene in org.genes]
 
-    # for input_organism in input_organisms:
-    #     seq_outdir = output / input_organism.name
-    #     mk_outdir(seq_outdir, force=True)
-    #
-    #     seq_fasta_file = seq_outdir / "cds_sequences.fasta"
-    #
-    #     write_gene_sequences_from_annotations(input_organism.genes, seq_fasta_file, disable_bar=True, add="ppanggolin_")
-    #
-    #     seq_fasta_files.append(seq_fasta_file)
     seq_fasta_file = output / 'input_genes.fasta'
 
     write_gene_sequences_from_annotations(input_genes, seq_fasta_file, disable_bar=True, add='ppanggolin_')