replace organism by genome

labgem · Dec 4, 2023 · c19a600 · c19a600
1 parent 389926e
commit c19a600
Show file tree

Hide file tree

Showing 36 changed files with 180 additions and 179 deletions.
diff --git a/.github/workflows/check_recipes.yml b/.github/workflows/check_recipes.yml
@@ -42,5 +42,5 @@ jobs:
         shell: bash -l {0}
         run: |
           cd testingDataset/
-          ppanggolin all --cpu 1 --anno organisms.gbff.list -o pango
+          ppanggolin all --cpu 1 --anno genomes.gbff.list -o pango
           ppanggolin info -p pango/pangenome.h5 --content --parameters --status
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -50,7 +50,7 @@ jobs:
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin all --cpu 1 --fasta organisms.fasta.list --output mybasicpangenome
+        ppanggolin all --cpu 1 --fasta genomes.fasta.list --output mybasicpangenome
         ppanggolin info --pangenome mybasicpangenome/pangenome.h5 --content --parameters --status
         cd -
     # test most options calls. If there is a change in the API somewhere that was not taken into account (whether in the options for the users, or the classes for the devs), this should fail, otherwise everything is probably good.
@@ -59,7 +59,7 @@ jobs:
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin annotate --fasta organisms.fasta.list --output stepbystep --kingdom bacteria
+        ppanggolin annotate --fasta genomes.fasta.list --output stepbystep --kingdom bacteria
         ppanggolin cluster -p stepbystep/pangenome.h5 --coverage 0.8 --identity 0.8
         ppanggolin graph -p stepbystep/pangenome.h5 -r 10
         ppanggolin partition --output stepbystep -f -p stepbystep/pangenome.h5 --cpu 1 -b 2.6 -ms 10 -fd -ck 500 -Kmm 3 12 -im 0.04 --draw_ICL -se $RANDOM
@@ -70,24 +70,24 @@ jobs:
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
         ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
         ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06  --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --borders --families_tsv --cpu 1 
-        ppanggolin write_genomes  -p stepbystep/pangenome.h5 --output stepbystep -f --fasta organisms.fasta.list --gff --proksee --table
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list
+        ppanggolin write_genomes  -p stepbystep/pangenome.h5 --output stepbystep -f --fasta genomes.fasta.list --gff --proksee --table
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta genomes.fasta.list
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log
         cd - 
     - name: gbff parsing and MSA computing
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin workflow --cpu 1 --anno organisms.gbff.list --output myannopang
+        ppanggolin workflow --cpu 1 --anno genomes.gbff.list --output myannopang
         ppanggolin msa --pangenome myannopang/pangenome.h5 --source dna --partition core -o myannopang/ -f --use_gene_id --phylo --single_copy
         cd -
     - name: clusters reading from external file
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv --output readclusterpang
-        ppanggolin annotate --anno organisms.gbff.list --output readclusters
+        ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang
+        ppanggolin annotate --anno genomes.gbff.list --output readclusters
         ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5
         ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f
         cd -
@@ -137,40 +137,40 @@ jobs:
       run: |
         cd testingDataset
         ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml
-        ppanggolin panrgp  --anno organisms.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml
+        ppanggolin panrgp  --anno genomes.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml
         cd -
     - name: testing projection cmd
       shell: bash -l {0}
       run: |
         cd testingDataset
-        head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list
-        ppanggolin projection --pangenome stepbystep/pangenome.h5  -o projection_from_list_of_gbff --anno organisms.gbff.head.list --gff --proksee
+        head genomes.gbff.list | sed 's/^/input_genome_/g' > genomes.gbff.head.list
+        ppanggolin projection --pangenome stepbystep/pangenome.h5  -o projection_from_list_of_gbff --anno genomes.gbff.head.list --gff --proksee
 
 
         ppanggolin projection --pangenome mybasicpangenome/pangenome.h5  -o projection_from_single_fasta \
-                              --organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \
+                              --genome_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \
                               --spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee --table --add_metadata
 
         ppanggolin projection --pangenome mybasicpangenome/pangenome.h5  -o projection_from_gff_prodigal \
-                              --organism_name chlam_annotated_with_prodigal --anno GBFF/GCF_003788785.1_ct114V1_genomic_prodigal_annotation.gff.gz \
+                              --genome_name chlam_annotated_with_prodigal --anno GBFF/GCF_003788785.1_ct114V1_genomic_prodigal_annotation.gff.gz \
                                --gff  --table
 
     - name: testing write_genome_cmds
       shell: bash -l {0}
       run: |
         cd testingDataset
-        head organisms.gbff.list | cut -f1  > organisms_names.gbff.head.list
+        head genomes.gbff.list | cut -f1  > genome_names.gbff.head.list
 
-        ppanggolin write_genomes  -p myannopang/pangenome.h5 --output flat_genomes_from_file_org -f \
-                                  --anno organisms.gbff.list --gff --table --organisms  organisms_names.gbff.head.list 
+        ppanggolin write_genomes  -p myannopang/pangenome.h5 --output flat_genomes_from_genome_files -f \
+                                  --anno genomes.gbff.list --gff --table --genomes  genome_names.gbff.head.list 
 
-        ppanggolin write_genomes  -p stepbystep/pangenome.h5 --output flat_genomes_from_cmdline_orgs --proksee \
-                                --organisms GCF_006508185.1_ASM650818v1_genomic,GCF_002088315.1_ASM208831v1_genomic
+        ppanggolin write_genomes  -p stepbystep/pangenome.h5 --output flat_genomes_from_cmdline_genomes --proksee \
+                                --genomes GCF_006508185.1_ASM650818v1_genomic,GCF_002088315.1_ASM208831v1_genomic
         
-        head organisms.fasta.list | cut -f1  > organisms_names.fasta.head.list 
+        head genomes.fasta.list | cut -f1  > genome_names.fasta.head.list 
         # Default separator is a pipe but a pipe is found in a value of metadata db1. That is why we use another separator here. 
         ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs \
-                                --organisms organisms_names.fasta.head.list \
+                                --genomes genome_names.fasta.head.list \
                                   -f --gff --add_metadata --table --metadata_sep § 
 
         # The pipe separator is found in metadata source db1. If we don't require this source, then writing with the pipe separator works fine. 

diff --git a/docs/user/Modules/moduleOutputs.md b/docs/user/Modules/moduleOutputs.md
@@ -1,7 +1,7 @@
 ## Module outputs
 
 ### Functional modules
-This .tsv file lists the modules and the gene families that belong to them. It lists one family per line, and there are multiple line for each module.
+This `.tsv` file lists the modules and the gene families that belong to them. It lists one family per line, and there are multiple lines for each module.
 It is written along with other files with the following command:
 `ppanggolin write_pangenome -p pangenome.h5 --modules`
 
@@ -54,7 +54,7 @@ The format of the 'modules_spots.tsv' file is the following:
 |module_id| The module identifier|
 |spot_id| the spot identifier|
 
-The file 'modules_RGP_lists.tsv' lists RGPs that have the same modules. Those RGPs can have different gene families, however they will not have any other module than those that are indicated. The format of the 'modules_RGP_lists.tsv' is the following:
+The file `modules_RGP_lists.tsv` lists RGPs that have the same modules. Those RGPs can have different gene families, however they will not have any other module than those that are indicated. The format of `modules_RGP_lists.tsv` is the following:
 
 |column|description|
 |------|------------|

diff --git a/docs/user/RGP/rgpOutputs.md b/docs/user/RGP/rgpOutputs.md
@@ -12,7 +12,7 @@ The file has the following format :
 | column | description |
 |--------|-------------|
 | region | a unique identifier for the region. This is usually built from the contig it is on, with a number after it|
-|organism| the organism it is in. This is the organism name provided by the user.|
+|genome| the genome it is in. This is the genome name provided by the user.|
 |start| the start position of the RGP in the contig|
 |stop| the stop position of the RGP in the contig|
 |genes| the number of genes included in the RGP|
@@ -72,7 +72,7 @@ For versions 1.2.30 and above, the 'draw' command can draw specific spots of int
 It will also write a gexf file, which corresponds to the gene families and their organization within the spots. It is basically a subgraph of the pangenome, consisting of the spot itself.
 The command can be used as such:
 
-`ppanggolin draw -p pangenome.h5 --spots all` will draw an interactive .html figure and a gexf file for all the spots.
+`ppanggolin draw -p pangenome.h5 --spots all` will draw an interactive `.html` figure and a `gexf` graph file for all the spots.
 
 If you are interested in only a single spot, you can use its identifier to draw it, as such:
 
@@ -86,7 +86,8 @@ The interactive figures that are drawn look like this:
 
 
 
-The plot represents the different gene organizations that are found in the spot. If there are RGPs with identical gene organization, the organization is represented only once (the represented RGP is picked at random among all identical RGPs). The list of RGPs with the same organization is accessible in the file written alongside the figure called 'spot_X_identical_rgps.tsv', with X the spot_id.
+The plot represents the different gene organizations that are found in the spot. If there are RGPs with identical gene organization, the organization is represented only once (the represented RGP is picked at random among all identical RGPs). The list of RGPs with the same organization is accessible in the file written alongside the figure called `spot_X_identical_rgps.tsv`, with X the spot_id.
 
 They can be edited using the sliders and the radio buttons, to change various graphical parameters, and then the plot itself can be saved using the save button on the right of the screen, if need be.
+
 For the gexf file, you can see how to visualize it in the section about the [pangenome gexf](../PangenomeAnalyses/pangenomeGraphOut.md#pangenome-graph-output).
diff --git a/docs/user/genomicContext.md b/docs/user/genomicContext.md
@@ -24,22 +24,22 @@ The second possibility is to give a list of gene families ID used to compute the
 
 This will search the common connected components in the computed pangenome and export the result in a tsv file.
 
-In this case, you can give a pangenome without gene families representatives sequences. This option is compatible with a pangenome computed with an external clustering (see the [cluster](PangenomeAnalyses/pangenomeBuild.md#clustering) subcommand).
+In this case, you can give a pangenome without gene family representative sequences. This option is compatible with a pangenome computed with an external clustering (see the [cluster](./PangenomeAnalyses/pangenomeCluster.md) subcommand).
 
 ## Output format
 
 If you are using family IDs, the only output is the `gene_context.tsv` file. Otherwise, if you use sequences, you will have an additional output file reporting the alignment between the sequences and the pangenome families (see details in the align subcommand).
 
 There are 6 columns in `gene_context.tsv`. 
 
-1. **geneContext ID**: Identifier of the found context. It is incrementally generated, beginning with 1
-2. **Gene family name**: Identifier of the gene family, from the pangenome, correspond to the found context
-3. **Sequence ID**: Identifier of the searched sequence in the pangenome
-4. **Nb Genomes**: Number of genomes where the genomic context is found
+1. **geneContext_ID**: Identifier of the found context. It is incrementally generated, beginning with 1
+2. **Gene_family_name**: Identifier of the gene family, from the pangenome, corresponding to the found context
+3. **Sequence_ID**: Identifier of the searched sequence in the pangenome
+4. **Nb_Genomes**: Number of genomes where the genomic context is found
 5. **Partition**: Partition of the gene family corresponding to the found context
-6. **Target family**: Whether the family is a target family, meaning it matches an input sequence, or a family provided as input.
+6. **Target_family**: Whether the family is a target family, meaning it matches an input sequence, or a family provided as input.
 
-In **sequence Id**, it is possible to find a NA value. This case, correspond to another gene family found in the context.
+In **Sequence_ID**, it is possible to find an NA value. This case corresponds to another gene family found in the context.
 
 ## Detailed options
 

diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py
@@ -346,7 +346,7 @@ def parser_rgp(parser: argparse.ArgumentParser):
     optional.add_argument('--min_length', required=False, type=int, default=3000,
                           help="Minimum length (bp) of a region to be considered a RGP")
     optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05,
-                          help="Minimum ratio of organisms where the family is present in which the family must "
+                          help="Minimum ratio of genomes where the family is present in which the family must "
                                "have multiple genes for it to be considered 'duplicated'")
 
 

diff --git a/ppanggolin/RGP/rgp_cluster.py b/ppanggolin/RGP/rgp_cluster.py
@@ -143,7 +143,7 @@ def add_info_to_rgp_nodes(graph, regions: List[Region], region_to_spot: dict):
     region_attributes = {}
     for region in regions:
         region_info = {"contig": region.contig.name,
-                       'organism': region.organism.name,
+                       'genome': region.organism.name,
                        "name": region.name,
                        "genes_count": len(region),
                        "is_contig_border": region.is_contig_border,

diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py
@@ -137,7 +137,7 @@ def write_spot_graph(graph_spot, outdir, graph_formats, file_basename="spotGraph
             graph_spot.nodes[node]["border0"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border0"]])
             graph_spot.nodes[node]["border1"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border1"]])
 
-            graph_spot.nodes[node]["organisms"] = ';'.join({rgp.organism.name for rgp in graph_spot.nodes[node]["rgp"]})
+            graph_spot.nodes[node]["genomes"] = ';'.join({rgp.organism.name for rgp in graph_spot.nodes[node]["rgp"]})
             graph_spot.nodes[node]["rgp"] = ';'.join([rgp.name for rgp in graph_spot.nodes[node]["rgp"]])
 
         if "gexf" in graph_formats:

diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
@@ -495,7 +495,7 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p
     :param disable_bar: Disable the progress bar
     """
 
-    logging.getLogger("PPanGGOLiN").info(f"Reading {organisms_file.name} the list of organism files ...")
+    logging.getLogger("PPanGGOLiN").info(f"Reading {organisms_file.name} the list of genome files ...")
 
     pangenome.status["geneSequences"] = "Computed"
     # we assume there are gene sequences in the annotation files,
@@ -551,7 +551,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: Path):
     for line in read_compressed_or_not(fasta_files):
         elements = [el.strip() for el in line.split("\t")]
         if len(elements) <= 1:
-            logging.getLogger("PPanGGOLiN").error("No tabulation separator found in organisms file")
+            logging.getLogger("PPanGGOLiN").error("No tabulation separator found in genome file")
             exit(1)
         try:
             org = pangenome.get_organism(elements[0])
@@ -563,7 +563,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: Path):
             fasta_dict[org] = read_fasta(org, currFastaFile)
     if set(pangenome.organisms) > set(fasta_dict.keys()):
         missing = pangenome.number_of_organisms - len(set(pangenome.organisms) & set(fasta_dict.keys()))
-        raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. "
+        raise Exception(f"Not all of your pangenome genomes are present within the provided fasta file. "
                         f"{missing} are missing (out of {pangenome.number_of_organisms}).")
 
     for org in pangenome.organisms:
@@ -574,7 +574,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: Path):
                 for rna in contig.RNAs:
                     rna.add_sequence(get_dna_sequence(fasta_dict[org][contig.name], rna))
             except KeyError:
-                msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} " \
+                msg = f"Fasta file for genome {org.name} did not have the contig {contig.name} " \
                       f"that was read from the annotation file. "
                 msg += f"The provided contigs in the fasta were : " \
                        f"{', '.join([contig for contig in fasta_dict[org].keys()])}."
@@ -600,7 +600,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu:
     :param disable_bar: Disable the progress bar
     """
 
-    logging.getLogger("PPanGGOLiN").info(f"Reading {fasta_list} the list of organism files")
+    logging.getLogger("PPanGGOLiN").info(f"Reading {fasta_list} the list of genome files")
 
     arguments = []  # Argument given to annotate organism in same order than prototype
     for line in read_compressed_or_not(fasta_list):
@@ -699,11 +699,11 @@ def parser_annot(parser: argparse.ArgumentParser):
     required = parser.add_argument_group(title="Required arguments",
                                          description="One of the following arguments is required :")
     required.add_argument('--fasta', required=False, type=Path,
-                          help="A tab-separated file listing the organism names, and the fasta filepath of its genomic "
-                               "sequence(s) (the fastas can be compressed with gzip). One line per organism.")
+                          help="A tab-separated file listing the genome names, and the fasta filepath of its genomic "
+                               "sequence(s) (the fastas can be compressed with gzip). One line per genome.")
     required.add_argument('--anno', required=False, type=Path,
-                          help="A tab-separated file listing the organism names, and the gff/gbff filepath of its "
-                               "annotations (the files can be compressed with gzip). One line per organism. "
+                          help="A tab-separated file listing the genome names, and the gff/gbff filepath of its "
+                               "annotations (the files can be compressed with gzip). One line per genome. "
                                "If this is provided, those annotations will be used.")
 
     optional = parser.add_argument_group(title="Optional arguments")