From c19a600413ab3ceed5f20a01b8b6728b52329d58 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 4 Dec 2023 17:23:14 +0100 Subject: [PATCH] replace organism by genome --- .github/workflows/check_recipes.yml | 2 +- .github/workflows/main.yml | 38 +++++------ docs/user/Modules/moduleOutputs.md | 4 +- docs/user/RGP/rgpOutputs.md | 7 +- docs/user/genomicContext.md | 14 ++-- ppanggolin/RGP/genomicIsland.py | 2 +- ppanggolin/RGP/rgp_cluster.py | 2 +- ppanggolin/RGP/spot.py | 2 +- ppanggolin/annotate/annotate.py | 18 +++--- ppanggolin/cluster/cluster.py | 2 +- ppanggolin/context/searchGeneContext.py | 26 ++++---- ppanggolin/edge.py | 4 +- ppanggolin/figures/draw_spot.py | 2 +- ppanggolin/figures/tile_plot.py | 6 +- ppanggolin/formats/readBinaries.py | 8 +-- ppanggolin/formats/writeAnnotations.py | 4 +- ppanggolin/formats/writeBinaries.py | 2 +- ppanggolin/formats/writeFlatGenomes.py | 30 ++++----- ppanggolin/formats/writeFlatPangenome.py | 20 +++--- ppanggolin/formats/writeMSA.py | 6 +- ppanggolin/formats/writeSequences.py | 10 +-- ppanggolin/geneFamily.py | 2 +- ppanggolin/genome.py | 10 +-- ppanggolin/graph/makeGraph.py | 6 +- ppanggolin/meta/meta.py | 2 +- ppanggolin/metrics/fluidity.py | 6 +- ppanggolin/mod/module.py | 2 +- ppanggolin/nem/partition.py | 10 +-- ppanggolin/nem/rarefaction.py | 20 +++--- ppanggolin/pangenome.py | 12 ++-- ppanggolin/projection/projection.py | 64 +++++++++---------- ppanggolin/region.py | 2 +- ppanggolin/utils.py | 4 +- ppanggolin/workflow/all.py | 10 +-- ...rganisms.fasta.list => genomes.fasta.list} | 0 ...{organisms.gbff.list => genomes.gbff.list} | 0 36 files changed, 180 insertions(+), 179 deletions(-) rename testingDataset/{organisms.fasta.list => genomes.fasta.list} (100%) rename testingDataset/{organisms.gbff.list => genomes.gbff.list} (100%) diff --git a/.github/workflows/check_recipes.yml b/.github/workflows/check_recipes.yml index 8fb0bf7f..49f3fb89 100644 --- a/.github/workflows/check_recipes.yml +++ 
b/.github/workflows/check_recipes.yml @@ -42,5 +42,5 @@ jobs: shell: bash -l {0} run: | cd testingDataset/ - ppanggolin all --cpu 1 --anno organisms.gbff.list -o pango + ppanggolin all --cpu 1 --anno genomes.gbff.list -o pango ppanggolin info -p pango/pangenome.h5 --content --parameters --status diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 230c9c96..915dc32c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -50,7 +50,7 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin all --cpu 1 --fasta organisms.fasta.list --output mybasicpangenome + ppanggolin all --cpu 1 --fasta genomes.fasta.list --output mybasicpangenome ppanggolin info --pangenome mybasicpangenome/pangenome.h5 --content --parameters --status cd - # test most options calls. If there is a change in the API somewhere that was not taken into account (whether in the options for the users, or the classes for the devs), this should fail, otherwise everything is probably good. 
@@ -59,7 +59,7 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin annotate --fasta organisms.fasta.list --output stepbystep --kingdom bacteria + ppanggolin annotate --fasta genomes.fasta.list --output stepbystep --kingdom bacteria ppanggolin cluster -p stepbystep/pangenome.h5 --coverage 0.8 --identity 0.8 ppanggolin graph -p stepbystep/pangenome.h5 -r 10 ppanggolin partition --output stepbystep -f -p stepbystep/pangenome.h5 --cpu 1 -b 2.6 -ms 10 -fd -ck 500 -Kmm 3 12 -im 0.04 --draw_ICL -se $RANDOM @@ -70,8 +70,8 @@ jobs: ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05 ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --borders --families_tsv --cpu 1 - ppanggolin write_genomes -p stepbystep/pangenome.h5 --output stepbystep -f --fasta organisms.fasta.list --gff --proksee --table - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list + ppanggolin write_genomes -p stepbystep/pangenome.h5 --output stepbystep -f --fasta genomes.fasta.list --gff --proksee --table + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta genomes.fasta.list ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log cd - @@ -79,15 +79,15 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin workflow --cpu 1 --anno organisms.gbff.list --output myannopang + ppanggolin workflow --cpu 1 --anno genomes.gbff.list --output myannopang ppanggolin msa --pangenome myannopang/pangenome.h5 --source dna 
--partition core -o myannopang/ -f --use_gene_id --phylo --single_copy cd - - name: clusters reading from external file shell: bash -l {0} run: | cd testingDataset - ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv --output readclusterpang - ppanggolin annotate --anno organisms.gbff.list --output readclusters + ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang + ppanggolin annotate --anno genomes.gbff.list --output readclusters ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f cd - @@ -137,40 +137,40 @@ jobs: run: | cd testingDataset ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml - ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml + ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml cd - - name: testing projection cmd shell: bash -l {0} run: | cd testingDataset - head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list - ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno organisms.gbff.head.list --gff --proksee + head genomes.gbff.list | sed 's/^/input_genome_/g' > genomes.gbff.head.list + ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno genomes.gbff.head.list --gff --proksee ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \ - --organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ + --genome_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ --spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee --table --add_metadata ppanggolin projection --pangenome 
mybasicpangenome/pangenome.h5 -o projection_from_gff_prodigal \ - --organism_name chlam_annotated_with_prodigal --anno GBFF/GCF_003788785.1_ct114V1_genomic_prodigal_annotation.gff.gz \ + --genome_name chlam_annotated_with_prodigal --anno GBFF/GCF_003788785.1_ct114V1_genomic_prodigal_annotation.gff.gz \ --gff --table - name: testing write_genome_cmds shell: bash -l {0} run: | cd testingDataset - head organisms.gbff.list | cut -f1 > organisms_names.gbff.head.list + head genomes.gbff.list | cut -f1 > genome_names.gbff.head.list - ppanggolin write_genomes -p myannopang/pangenome.h5 --output flat_genomes_from_file_org -f \ - --anno organisms.gbff.list --gff --table --organisms organisms_names.gbff.head.list + ppanggolin write_genomes -p myannopang/pangenome.h5 --output flat_genomes_from_genome_files -f \ + --anno genomes.gbff.list --gff --table --genomes genome_names.gbff.head.list - ppanggolin write_genomes -p stepbystep/pangenome.h5 --output flat_genomes_from_cmdline_orgs --proksee \ - --organisms GCF_006508185.1_ASM650818v1_genomic,GCF_002088315.1_ASM208831v1_genomic + ppanggolin write_genomes -p stepbystep/pangenome.h5 --output flat_genomes_from_cmdline_genomes --proksee \ + --genomes GCF_006508185.1_ASM650818v1_genomic,GCF_002088315.1_ASM208831v1_genomic - head organisms.fasta.list | cut -f1 > organisms_names.fasta.head.list + head genomes.fasta.list | cut -f1 > genome_names.fasta.head.list # Default separator is a pipe but a pipe is found in a value of metadata db1. That is why we use another separator here. ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs \ - --organisms organisms_names.fasta.head.list \ + --genomes genome_names.fasta.head.list \ -f --gff --add_metadata --table --metadata_sep § # Pipe separatore is found in metadata source db1. if we don't require this source then the writting with pipe is work fine. 
diff --git a/docs/user/Modules/moduleOutputs.md b/docs/user/Modules/moduleOutputs.md index dc5a77e8..eb513136 100644 --- a/docs/user/Modules/moduleOutputs.md +++ b/docs/user/Modules/moduleOutputs.md @@ -1,7 +1,7 @@ ## Module outputs ### Functional modules -This .tsv file lists the modules and the gene families that belong to them. It lists one family per line, and there are multiple line for each module. +This `.tsv` file lists the modules and the gene families that belong to them. It lists one family per line, and there are multiple line for each module. It is written along with other files with the following command: `ppanggolin write_pangenome -p pangenome.h5 --modules` @@ -54,7 +54,7 @@ The format of the 'modules_spots.tsv' file is the following: |module_id| The module identifier| |spot_id| the spot identifier| -The file 'modules_RGP_lists.tsv' lists RGPs that have the same modules. Those RGPs can have different gene families, however they will not have any other module than those that are indicated. The format of the 'modules_RGP_lists.tsv' is the following: +The file `modules_RGP_lists.tsv` lists RGPs that have the same modules. Those RGPs can have different gene families, however they will not have any other module than those that are indicated. The format of the 'modules_RGP_lists.tsv' is the following: |column|description| |------|------------| diff --git a/docs/user/RGP/rgpOutputs.md b/docs/user/RGP/rgpOutputs.md index f5664df4..e2a76cd8 100644 --- a/docs/user/RGP/rgpOutputs.md +++ b/docs/user/RGP/rgpOutputs.md @@ -12,7 +12,7 @@ The file has the following format : | column | description | |--------|-------------| | region | a unique identifier for the region. This is usually built from the contig it is on, with a number after it| -|organism| the organism it is in. This is the organism name provided by the user.| +|genome| the genome it is in. 
This is the genome name provided by the user.| |start| the start position of the RGP in the contig| |stop| the stop position of the RGP in the contig| |genes| the number of genes included in the RGP| @@ -72,7 +72,7 @@ For versions 1.2.30 and above, the 'draw' command can draw specific spots of int It will also write a gexf file, which corresponds to the gene families and their organization within the spots. It is basically a subgraph of the pangenome, consisting of the spot itself. The command can be used as such: -`ppanggolin draw -p pangenome.h5 --spots all` will draw an interactive .html figure and a gexf file for all the spots. +`ppanggolin draw -p pangenome.h5 --spots all` will draw an interactive `.html` figure and a `gexf` graph file for all the spots. If you are interested in only a single spot, you can use its identifier to draw it, as such: @@ -86,7 +86,8 @@ The interactive figures that are drawn look like this: -The plot represents the different gene organizations that are found in the spot. If there are RGPs with identical gene organization, the organization is represented only once (the represented RGP is picked at random among all identical RGPs). The list of RGPs with the same organization is accessible in the file written alongside the figure called 'spot_X_identical_rgps.tsv', with X the spot_id. +The plot represents the different gene organizations that are found in the spot. If there are RGPs with identical gene organization, the organization is represented only once (the represented RGP is picked at random among all identical RGPs). The list of RGPs with the same organization is accessible in the file written alongside the figure called `spot_X_identical_rgps.tsv`, with X the spot_id. They can be edited using the sliders and the radio buttons, to change various graphical parameters, and then the plot itself can be saved using the save button on the right of the screen, if need be. 
+ For the gexf file, you can see how to visualize it in the section about the [pangenome gexf](../PangenomeAnalyses/pangenomeGraphOut.md#pangenome-graph-output). \ No newline at end of file diff --git a/docs/user/genomicContext.md b/docs/user/genomicContext.md index 1698d03f..b1af1d64 100644 --- a/docs/user/genomicContext.md +++ b/docs/user/genomicContext.md @@ -24,7 +24,7 @@ The second possibility is to give a list of gene families ID used to compute the This will search the common connected components in the computed pangenome and export the result in a tsv file. -In this case, you can give a pangenome without gene families representatives sequences. This option is compatible with a pangenome computed with an external clustering (see the [cluster](PangenomeAnalyses/pangenomeBuild.md#clustering) subcommand). +In this case, you can give a pangenome without gene families representatives sequences. This option is compatible with a pangenome computed with an external clustering (see the [cluster](./PangenomeAnalyses/pangenomeCluster.md) subcommand). ## Output format @@ -32,14 +32,14 @@ In case of you are using families ID, you will only have as output the `gene_con There are 6 columns in `gene_context.tsv`. -1. **geneContext ID**: Identifier of the found context. It is incrementally generated, beginning with 1 -2. **Gene family name**: Identifier of the gene family, from the pangenome, correspond to the found context -3. **Sequence ID**: Identifier of the searched sequence in the pangenome -4. **Nb Genomes**: Number of genomes where the genomic context is found +1. **geneContext_ID**: Identifier of the found context. It is incrementally generated, beginning with 1 +2. **Gene_family_name**: Identifier of the gene family, from the pangenome, correspond to the found context +3. **Sequence_ID**: Identifier of the searched sequence in the pangenome +4. **Nb_Genomes**: Number of genomes where the genomic context is found 5. 
**Partition**: Partition of the gene family corresponding to the found context -6. **Target family**: Whether the family is a target family, meaning it matches an input sequence, or a family provided as input. +6. **Target_family**: Whether the family is a target family, meaning it matches an input sequence, or a family provided as input. -In **sequence Id**, it is possible to find a NA value. This case, correspond to another gene family found in the context. +In **sequence_Id**, it is possible to find a NA value. This case, correspond to another gene family found in the context. ## Detailed options diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 96cf54f6..b769905f 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -346,7 +346,7 @@ def parser_rgp(parser: argparse.ArgumentParser): optional.add_argument('--min_length', required=False, type=int, default=3000, help="Minimum length (bp) of a region to be considered a RGP") optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, - help="Minimum ratio of organisms where the family is present in which the family must " + help="Minimum ratio of genomes where the family is present in which the family must " "have multiple genes for it to be considered 'duplicated'") diff --git a/ppanggolin/RGP/rgp_cluster.py b/ppanggolin/RGP/rgp_cluster.py index 737790fa..bca53fe3 100644 --- a/ppanggolin/RGP/rgp_cluster.py +++ b/ppanggolin/RGP/rgp_cluster.py @@ -143,7 +143,7 @@ def add_info_to_rgp_nodes(graph, regions: List[Region], region_to_spot: dict): region_attributes = {} for region in regions: region_info = {"contig": region.contig.name, - 'organism': region.organism.name, + 'genome': region.organism.name, "name": region.name, "genes_count": len(region), "is_contig_border": region.is_contig_border, diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 00b6e7a6..5cb492c0 100644 --- a/ppanggolin/RGP/spot.py +++ 
b/ppanggolin/RGP/spot.py @@ -137,7 +137,7 @@ def write_spot_graph(graph_spot, outdir, graph_formats, file_basename="spotGraph graph_spot.nodes[node]["border0"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border0"]]) graph_spot.nodes[node]["border1"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border1"]]) - graph_spot.nodes[node]["organisms"] = ';'.join({rgp.organism.name for rgp in graph_spot.nodes[node]["rgp"]}) + graph_spot.nodes[node]["genomes"] = ';'.join({rgp.organism.name for rgp in graph_spot.nodes[node]["rgp"]}) graph_spot.nodes[node]["rgp"] = ';'.join([rgp.name for rgp in graph_spot.nodes[node]["rgp"]]) if "gexf" in graph_formats: diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 59b81257..5794a08e 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -495,7 +495,7 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p :param disable_bar: Disable the progress bar """ - logging.getLogger("PPanGGOLiN").info(f"Reading {organisms_file.name} the list of organism files ...") + logging.getLogger("PPanGGOLiN").info(f"Reading {organisms_file.name} the list of genome files ...") pangenome.status["geneSequences"] = "Computed" # we assume there are gene sequences in the annotation files, @@ -551,7 +551,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: Path): for line in read_compressed_or_not(fasta_files): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - logging.getLogger("PPanGGOLiN").error("No tabulation separator found in organisms file") + logging.getLogger("PPanGGOLiN").error("No tabulation separator found in genome file") exit(1) try: org = pangenome.get_organism(elements[0]) @@ -563,7 +563,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: Path): fasta_dict[org] = read_fasta(org, currFastaFile) if set(pangenome.organisms) > set(fasta_dict.keys()): missing = 
pangenome.number_of_organisms - len(set(pangenome.organisms) & set(fasta_dict.keys())) - raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. " + raise Exception(f"Not all of your pangenome genomes are present within the provided fasta file. " f"{missing} are missing (out of {pangenome.number_of_organisms}).") for org in pangenome.organisms: @@ -574,7 +574,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: Path): for rna in contig.RNAs: rna.add_sequence(get_dna_sequence(fasta_dict[org][contig.name], rna)) except KeyError: - msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} " \ + msg = f"Fasta file for genome {org.name} did not have the contig {contig.name} " \ f"that was read from the annotation file. " msg += f"The provided contigs in the fasta were : " \ f"{', '.join([contig for contig in fasta_dict[org].keys()])}." @@ -600,7 +600,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: :param disable_bar: Disable the progress bar """ - logging.getLogger("PPanGGOLiN").info(f"Reading {fasta_list} the list of organism files") + logging.getLogger("PPanGGOLiN").info(f"Reading {fasta_list} the list of genome files") arguments = [] # Argument given to annotate organism in same order than prototype for line in read_compressed_or_not(fasta_list): @@ -699,11 +699,11 @@ def parser_annot(parser: argparse.ArgumentParser): required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") required.add_argument('--fasta', required=False, type=Path, - help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " - "sequence(s) (the fastas can be compressed with gzip). One line per organism.") + help="A tab-separated file listing the genome names, and the fasta filepath of its genomic " + "sequence(s) (the fastas can be compressed with gzip). 
One line per genome.") required.add_argument('--anno', required=False, type=Path, - help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " - "annotations (the files can be compressed with gzip). One line per organism. " + help="A tab-separated file listing the genome names, and the gff/gbff filepath of its " + "annotations (the files can be compressed with gzip). One line per genome. " "If this is provided, those annotations will be used.") optional = parser.add_argument_group(title="Optional arguments") diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 3e3f3e85..1499dd5d 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -266,7 +266,7 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F logging.getLogger("PPanGGOLiN").debug(f"gene_to_fam size: {len(gene_to_fam)}, " f"Pangenome nb genes: {pangenome.number_of_genes}") raise Exception("Something unexpected happened during clustering (have less genes clustered than genes " - "in the pangenome). A probable reason is that two genes in two different organisms have " + "in the pangenome). 
A probable reason is that two genes in two different genomes have " "the same IDs; If you are sure that all of your genes have non identical IDs, please post an " "issue at https://github.com/labgem/PPanGGOLiN/") for gene, (family, is_frag) in tqdm(gene_to_fam.items(), unit="gene", total=len(gene_to_fam), disable=disable_bar): diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 0114664b..2a248568 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -233,7 +233,7 @@ def filter_attribute(data: dict): # on top of attributes already contained in node of context graph # add organisms and genes count that have the family, the partition and if the family was in initially requested - nodes_family_data = {f.name: {"organisms": f.number_of_organisms, + nodes_family_data = {f.name: {"genomes": f.number_of_organisms, "partition": f.named_partition, "genes": f.number_of_genes} for f in context_graph.nodes()} @@ -278,7 +278,7 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) """ # compute jaccard on organism and on genes for f1, f2, data in context_graph.edges(data=True): - data['jaccard_organism'] = len(data['organisms']) / len(set(f1.organisms) | set(f2.organisms)) + data['jaccard_genome'] = len(data['genomes']) / len(set(f1.organisms) | set(f2.organisms)) f1_gene_proportion = len(data['genes'][f1]) / f1.number_of_genes f2_gene_proportion = len(data['genes'][f2]) / f2.number_of_genes @@ -301,8 +301,8 @@ def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) # the following commented out lines are additional metrics that could be used - # data['min_jaccard_organism'] = len(data['organisms'])/min(len(f1.organisms), len(f2.organisms)) - # data['max_jaccard_organism'] = len(data['organisms'])/max(len(f1.organisms), len(f2.organisms)) + # data['min_jaccard_genome'] = len(data['genomes'])/min(len(f1.genomes), len(f2.genomes)) + 
# data['max_jaccard_genome'] = len(data['genomes'])/max(len(f1.genomes), len(f2.genomes)) # f1_gene_proportion_partial = len(data['genes'][f1])/len(context_graph.nodes[f1]['genes']) # f2_gene_proportion_partial = len(data['genes'][f2])/len(context_graph.nodes[f2]['genes']) # data[f'f1_jaccard_gene_partital'] = f1_gene_proportion_partial @@ -377,11 +377,11 @@ def add_edges_to_context_graph(context_graph: nx.Graph, add_val_to_dict_attribute(genes_edge_dict, gene.family, gene) add_val_to_dict_attribute(genes_edge_dict, next_gene.family, next_gene) - add_val_to_dict_attribute(edge_dict, "organisms", gene.organism) + add_val_to_dict_attribute(edge_dict, "genomes", gene.organism) increment_attribute_counter(edge_dict, "gene_pairs") - assert gene.organism == next_gene.organism, (f"Gene of the same contig have a different organism. " + assert gene.organism == next_gene.organism, (f"Gene of the same contig have a different genome. " f"{gene.organism} and {next_gene.organism}") @@ -536,18 +536,18 @@ def export_context_to_dataframe(gene_contexts: set, fam2seq: Dict[str, int], else: sequence_id = ','.join(fam2seq.get(family)) - family_info = {"GeneContext ID": gene_context.ID, - "Gene family name": family.name, - "Sequence ID": sequence_id, - "Nb Genomes": family.number_of_organisms, + family_info = {"GeneContext_ID": gene_context.ID, + "Gene_family_name": family.name, + "Sequence_ID": sequence_id, + "Nb_Genomes": family.number_of_organisms, "Partition": family.named_partition, - "Target family": family in families_of_interest} + "Target_family": family in families_of_interest} lines.append(family_info) - df = pd.DataFrame(lines).set_index("GeneContext ID") + df = pd.DataFrame(lines).set_index("GeneContext_ID") - df = df.sort_values(["GeneContext ID", "Sequence ID"], na_position='last') + df = df.sort_values(["GeneContext_ID", "Sequence_ID"], na_position='last') df.to_csv(output, sep="\t", na_rep='NA') diff --git a/ppanggolin/edge.py b/ppanggolin/edge.py index 
6a1b0840..2d21f334 100644 --- a/ppanggolin/edge.py +++ b/ppanggolin/edge.py @@ -101,8 +101,8 @@ def add_genes(self, source_gene: Gene, target_gene: Gene): raise TypeError(f"Genes are expected to be added to edge. " f"Given type for source: {type(source_gene)} and target: {type(target_gene)}") if source_gene.organism is None or target_gene.organism is None: - raise ValueError("Genes are not associated to organism. It's needed to create add genes to edge") + raise ValueError("Genes are not associated to genome. It's needed to create add genes to edge") if source_gene.organism != target_gene.organism: - raise Exception(f"You tried to create an edge between two genes that are not even in the same organism ! " + raise Exception(f"You tried to create an edge between two genes that are not even in the same genome ! " f"(genes are '{source_gene.ID}' and '{target_gene.ID}')") self._organisms[source_gene.organism].append((source_gene, target_gene)) diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index d40b38c3..951711e6 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -579,7 +579,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: # write rgps representatives and the rgps they are identical to out_struc = open(fname.absolute().as_posix() + '_identical_rgps.tsv', 'w') - out_struc.write('representative_rgp\trepresentative_rgp_organism\tidentical_rgp\tidentical_rgp_organism\n') + out_struc.write('representative_rgp\trepresentative_rgp_genome\tidentical_rgp\tidentical_rgp_genome\n') for key_rgp, other_rgps in spot.get_uniq_to_rgp().items(): for rgp in other_rgps: out_struc.write(f"{key_rgp.name}\t{key_rgp.organism.name}\t{rgp.name}\t{rgp.organism.name}\n") diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 6cd6161f..bf5b1012 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -35,7 +35,7 @@ def 
draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di if pangenome.status["partitioned"] == "No": raise Exception("Cannot draw the tile plot as your pangenome has not been partitioned") if pangenome.number_of_organisms > 500 and nocloud is False: - logging.getLogger("PPanGGOLiN").warning("You asked to draw a tile plot for a lot of organisms (>500). " + logging.getLogger("PPanGGOLiN").warning("You asked to draw a tile plot for a lot of genomes (>500). " "Your browser will probably not be able to open it.") logging.getLogger("PPanGGOLiN").info("Drawing the tile plot...") data = [] @@ -71,7 +71,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di hc = linkage(dist, 'single') dendro_org = dendrogram(hc, no_plot=True) - logging.getLogger("PPanGGOLiN").info("done with making the dendrogram to order the organisms on the plot") + logging.getLogger("PPanGGOLiN").info("done with making the dendrogram to order the genomes on the plot") order_organisms = [index2org[index] for index in dendro_org["leaves"]] @@ -161,7 +161,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di layout = go.Layout(title="presence/absence matrix", xaxis=go.layout.XAxis(ticktext=xaxis_values, - title='organisms', + title='genomes', tickvals=xaxis_values, automargin=True, tickfont=dict(size=10)), diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index de66b27a..a6c5de0f 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -78,7 +78,7 @@ def get_number_of_organisms(pangenome: Pangenome) -> int: table = annotations.genes org_set = set() - for org in read_chunks(table, column="organism"): + for org in read_chunks(table, column="genome"): org_set.add(org) h5f.close() return len(org_set) @@ -452,7 +452,7 @@ def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20 contig = Contig(identifier=int(row["ID"]), 
name=row["name"].decode(), is_circular=row["is_circular"]) contig.length = int(row["length"]) try: - organism = pangenome.get_organism(row["organism"].decode()) + organism = pangenome.get_organism(row["genome"].decode()) except KeyError: pass else: @@ -548,8 +548,8 @@ def read_info(h5f: tables.File): info_group = h5f.root.info print("Content: ") print(f"\t- Genes: {info_group._v_attrs['numberOfGenes']}") - if "numberOfOrganisms" in info_group._v_attrs._f_list(): - print(f"\t- Organisms: {info_group._v_attrs['numberOfOrganisms']}") + if "numberOfGenomes" in info_group._v_attrs._f_list(): + print(f"\t- Genomes: {info_group._v_attrs['numberOfGenomes']}") if "numberOfClusters" in info_group._v_attrs._f_list(): print(f"\t- Families: {info_group._v_attrs['numberOfClusters']}") if "numberOfEdges" in info_group._v_attrs._f_list(): diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index bb1de011..239264b2 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -88,7 +88,7 @@ def contig_desc(contig_len: int, org_len: int) -> Dict[str, Union[tables.StringC 'name': tables.StringCol(itemsize=contig_len), "is_circular": tables.BoolCol(dflt=False), 'length': tables.UInt32Col(), - "organism": tables.StringCol(itemsize=org_len)} + "genome": tables.StringCol(itemsize=org_len)} def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, @@ -109,7 +109,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro contig_row["name"] = contig.name contig_row["is_circular"] = contig.is_circular contig_row["length"] = len(contig) - contig_row["organism"] = contig.organism.name + contig_row["genome"] = contig.organism.name contig_row.append() contig_table.flush() diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index c45cdd76..5aa84dfb 100644 --- a/ppanggolin/formats/writeBinaries.py +++ 
b/ppanggolin/formats/writeBinaries.py @@ -447,7 +447,7 @@ def getmin(arg: iter) -> float: info_group = h5f.create_group("/", "info", "Informations about the pangenome content") if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"]: info_group._v_attrs.numberOfGenes = pangenome.number_of_genes - info_group._v_attrs.numberOfOrganisms = pangenome.number_of_organisms + info_group._v_attrs.numberOfGenomes = pangenome.number_of_organisms if pangenome.status["genesClustered"] in ["Computed", "Loaded"]: info_group._v_attrs.numberOfClusters = pangenome.number_of_gene_families if pangenome.status["neighborsGraph"] in ["Computed", "Loaded"]: diff --git a/ppanggolin/formats/writeFlatGenomes.py b/ppanggolin/formats/writeFlatGenomes.py index 773c8543..77a3a673 100644 --- a/ppanggolin/formats/writeFlatGenomes.py +++ b/ppanggolin/formats/writeFlatGenomes.py @@ -77,7 +77,7 @@ def write_tsv_genome_file(organism: Organism, output: Path, compress: bool = Fal gene_info["stop"] = gene.stop gene_info["strand"] = gene.strand gene_info["family"] = gene.family.name - gene_info["nb_copy_in_org"] = len(list(gene.family.get_genes_per_org(organism))) + gene_info["nb_copy_in_genome"] = len(list(gene.family.get_genes_per_org(organism))) gene_info["partition"] = gene.family.named_partition gene_info["persistent_neighbors"] = nb_pers gene_info["shell_neighbors"] = nb_shell @@ -390,7 +390,7 @@ def get_organism_list(organisms_filt: str, pangenome: Pangenome) -> Set[Organism else: if Path(organisms_filt).is_file(): - logging.getLogger("PPanGGOLiN").debug("Parsing the list of organisms from a file " + logging.getLogger("PPanGGOLiN").debug("Parsing the list of genomes from a file " "to determine which genomes should be included in the output.") with open(organisms_filt) as fl: org_names = [line.strip() for line in fl if line and not line.startswith("#")] @@ -406,7 +406,7 @@ def get_organism_list(organisms_filt: str, pangenome: Pangenome) -> Set[Organism except KeyError: 
org_not_in_pangenome.add(org_name) if org_not_in_pangenome: - raise KeyError(f"{len(org_not_in_pangenome)} organism(s) specified with '--organisms' parameter were " + raise KeyError(f"{len(org_not_in_pangenome)} organism(s) specified with '--genomes' parameter were " f"not found in the pangenome: {', '.join(org_not_in_pangenome)}") logging.getLogger("PPanGGOLiN").info( @@ -508,7 +508,7 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa organisms_list = get_organism_list(organisms_filt, pangenome) if not organisms_list: - raise ValueError("No genomes are selected for output. Please check the '--organisms' parameter.") + raise ValueError("No genomes are selected for output. Please check the '--genomes' parameter.") org_dict = parse_input_paths_file(organisms_file) if organisms_file and (gff or proksee) else None @@ -519,7 +519,7 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa organism2args = defaultdict(lambda: {"output": output, "table": table, "gff": gff, "proksee": proksee, "compress": compress}) for organism in organisms_list: - organism_args = {"organisms_file": org_dict[organism.name]['path'] if org_dict else None} + organism_args = {"genome_file": org_dict[organism.name]['path'] if org_dict else None} if proksee: organism_args["module_to_colors"] = {module: module_to_colors[module] for module in organism.modules} @@ -540,7 +540,7 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa start_writing = time.time() with ThreadPoolExecutor(max_workers=cpu) as executor: - with tqdm(total=(len(organisms_list)), unit="organism", disable=disable_bar) as progress: + with tqdm(total=(len(organisms_list)), unit="genome", disable=disable_bar) as progress: futures = [] for organism, kwargs in organism2args.items(): logging.getLogger("PPanGGOLiN").debug(f"Writing genome annotations for {organism.name}") @@ -569,7 +569,7 @@ def launch(args: argparse.Namespace): 
pangenome.add_file(args.pangenome) write_flat_genome_files(pangenome, args.output, table=args.table, gff=args.gff, proksee=args.proksee, - compress=args.compress, fasta=args.fasta, anno=args.anno, organisms_filt=args.organisms, + compress=args.compress, fasta=args.fasta, anno=args.anno, organisms_filt=args.genomes, add_metadata=args.add_metadata, metadata_sep=args.metadata_sep, metadata_sources=args.metadata_sources, cpu=args.cpu, disable_bar=args.disable_prog_bar) @@ -613,12 +613,12 @@ def parser_flat(parser: argparse.ArgumentParser): optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") - optional.add_argument("--organisms", + optional.add_argument("--genomes", required=False, default="all", - help="Specify the organisms for which to generate output. " - "You can provide a list of organism names either directly in the command line separated " - "by commas, or by referencing a file containing the list of organism names, " + help="Specify the genomes for which to generate output. " + "You can provide a list of genome names either directly in the command line separated " + "by commas, or by referencing a file containing the list of genome names, " "with one name per line.") optional.add_argument("--add_metadata", @@ -647,12 +647,12 @@ def parser_flat(parser: argparse.ArgumentParser): "used to add sequence information to the output file:") context.add_argument('--fasta', required=False, type=Path, - help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " - "sequence(s) (the fastas can be compressed with gzip). One line per organism.") + help="A tab-separated file listing the genome names, and the fasta filepath of its genomic " + "sequence(s) (the fastas can be compressed with gzip). 
One line per genome.") context.add_argument('--anno', required=False, type=Path, - help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " - "annotations (the files can be compressed with gzip). One line per organism. " + help="A tab-separated file listing the genome names, and the gff/gbff filepath of its " + "annotations (the files can be compressed with gzip). One line per genome. " "If this is provided, those annotations will be used.") diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index 18fa648a..652db251 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -47,7 +47,7 @@ def write_json_header(json: TextIO): """ json.write('{"directed": false, "multigraph": false,') json.write(' "graph": {') - json.write(' "organisms": {') + json.write(' "genomes": {') orgstr = [] for org in pan.organisms: orgstr.append('"' + org.name + '": {') @@ -131,7 +131,7 @@ def write_json_edge(edge: Edge, json: TextIO): """ json.write("{") json.write(f'"weight": {len(edge.gene_pairs)}, "source": "{edge.source.name}", "target": "{edge.target.name}"') - json.write(', "organisms": {') + json.write(', "genomes": {') orgstr = [] for org in edge.organisms: orgstr.append('"' + org.name + '": [') @@ -554,7 +554,7 @@ def summarize_genome(organism: Organism, module_count = "Not computed" if module_count is None else module_count summary_info = { - "Organism_name": organism.name, + "Genome_name": organism.name, "Contigs": organism.number_of_contigs, "Genes": gene_count, "Fragmented_genes": fragmented_genes_count, @@ -699,7 +699,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, summaries.append(organism_summary) - write_summaries_in_tsv(summaries, output_file= output / "organisms_statistics.tsv", dup_margin=dup_margin, soft_core=soft_core) + write_summaries_in_tsv(summaries, output_file= output / "genomes_statistics.tsv", 
dup_margin=dup_margin, soft_core=soft_core) logging.getLogger("PPanGGOLiN").info("Done writing genome per genome statistics") @@ -861,7 +861,7 @@ def write_module_summary(output: Path, compress: bool = False): """ logging.getLogger("PPanGGOLiN").info("Writing functional modules summary...") with write_compressed_or_not(output / "modules_summary.tsv", compress) as fout: - fout.write("module_id\tnb_families\tnb_organisms\tpartition\tmean_number_of_occurrence\n") + fout.write("module_id\tnb_families\tnb_genomes\tpartition\tmean_number_of_occurrence\n") for mod in pan.modules: org_dict = defaultdict(set) partition_counter = Counter() @@ -901,9 +901,9 @@ def write_org_modules(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger("PPanGGOLiN").info("Writing modules to organisms associations...") - with write_compressed_or_not(output / "modules_in_organisms.tsv", compress) as fout: - fout.write("module_id\torganism\tcompletion\n") + logging.getLogger("PPanGGOLiN").info("Writing modules to genomes associations...") + with write_compressed_or_not(output / "modules_in_genomes.tsv", compress) as fout: + fout.write("module_id\tgenome\tcompletion\n") for mod in pan.modules: mod_orgs = set() for fam in mod.families: @@ -913,7 +913,7 @@ def write_org_modules(output: Path, compress: bool = False): fout.write(f"module_{mod.ID}\t{org.name}\t{completion:.2}\n") fout.close() logging.getLogger("PPanGGOLiN").info( - f"Done writing modules to organisms associations to: '{output.as_posix() + '/modules_in_organisms.tsv'}'") + f"Done writing modules to genomes associations to: '{output.as_posix() + '/modules_in_genomes.tsv'}'") def write_spot_modules(output: Path, compress: bool = False): @@ -1151,7 +1151,7 @@ def parser_flat(parser: argparse.ArgumentParser): help="Soft core threshold to use") optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, - help="minimum 
ratio of organisms in which the family must have multiple genes " + help="minimum ratio of genomes in which the family must have multiple genes " "for it to be considered 'duplicated'") diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index af189edd..ed6936aa 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -381,7 +381,7 @@ def parser_msa(parser: argparse.ArgumentParser): optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, help="Soft core threshold to use if 'softcore' partition is chosen") optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, - help="minimum ratio of organisms in which the family must have multiple genes " + help="minimum ratio of genomes in which the family must have multiple genes " "for it to be considered 'duplicated'") optional.add_argument("--single_copy", required=False, action="store_true", default=False, help="Use report gene families that are considered 'single copy', for details see " @@ -394,8 +394,8 @@ def parser_msa(parser: argparse.ArgumentParser): optional.add_argument("--phylo", required=False, action='store_true', help="Writes a whole genome msa file for additional phylogenetic analysis") optional.add_argument("--use_gene_id", required=False, action='store_true', - help="Use gene identifiers rather than organism names for sequences in the family MSA" - " (organism names are used by default)") + help="Use gene identifiers rather than genome names for sequences in the family MSA" + " (genome names are used by default)") optional.add_argument("--translation_table", required=False, default=11, type=int, help="Translation table (genetic code) to use.") optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 1559f2d9..4cdb166c 100644 --- 
a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -268,7 +268,7 @@ def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]: # check_contig_names if set(contig_to_sequence) != {contig.name for contig in organism.contigs}: - raise Exception(f"Contig name inconsistency detected in organism '{organism.name}' between the " + raise Exception(f"Contig name inconsistency detected in genome '{organism.name}' between the " f"information stored in the pangenome file and the contigs found in '{genome_file}'.") return contig_to_sequence @@ -469,11 +469,11 @@ def parser_seq(parser: argparse.ArgumentParser): context = parser.add_argument_group(title="Contextually required arguments", description="With --regions, the following arguments are required:") context.add_argument('--fasta', required=False, type=Path, - help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " - "sequence(s) (the fastas can be compressed with gzip). One line per organism.") + help="A tab-separated file listing the genome names, and the fasta filepath of its genomic " + "sequence(s) (the fastas can be compressed with gzip). One line per genome.") context.add_argument('--anno', required=False, type=Path, - help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " - "annotations (the files can be compressed with gzip). One line per organism. " + help="A tab-separated file listing the genome names, and the gff/gbff filepath of its " + "annotations (the files can be compressed with gzip). One line per genome. " "If this is provided, those annotations will be used.") onereq = parser.add_argument_group(title="Output file", description="At least one of the following argument is required. 
" diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 4cd849bc..130a7d5f 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -383,7 +383,7 @@ def get_genes_per_org(self, org: Organism) -> Generator[Gene, None, None]: if len(self._genePerOrg) == 0: _ = self.get_org_dict() if org not in self._genePerOrg: - raise KeyError(f"Organism don't belong to the gene family: {self.name}") + raise KeyError(f"Genome does not have the gene family: {self.name}") for gene in self._genePerOrg[org]: yield gene diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 5e32c6ce..34da2654 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -364,8 +364,8 @@ def __setitem__(self, start: int, gene: Gene): raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") if start in self._genes_getter: raise ValueError(f"Gene '{self._genes_getter[start].ID}' with start position {start} already exists in the " - f"contig '{self.name}' {f'from organism {self.organism}' if self.organism else ''}, " - f"cannot add gene '{gene.ID}' {f'from organism {gene.organism}' if gene.organism else ''}") + f"contig '{self.name}' {f'from genome {self.organism}' if self.organism else ''}, " + f"cannot add gene '{gene.ID}' {f'from genome {gene.organism}' if gene.organism else ''}") if gene.position is None: raise AttributeError("The gene object needs to have its position in the contig filled before adding it") # Adding empty values. 
@@ -708,7 +708,7 @@ def __setitem__(self, name: str, contig: Contig): raise TypeError(f"'Contig' type was expected but you provided a '{type(contig)}' type object") if name in self._contigs_getter: # Add test if contig are equivalent when __eq__ method will be defined in Contig - raise KeyError(f"Contig {contig.name} already in organism {self.name}") + raise KeyError(f"Contig {contig.name} already in genome {self.name}") self._contigs_getter[contig.name] = contig contig.organism = self @@ -727,7 +727,7 @@ def __getitem__(self, name: str) -> Contig: try: return self._contigs_getter[name] except KeyError: - raise KeyError(f"Contig with the name: {name} does not exist in the organism") + raise KeyError(f"Contig with the name: {name} does not exist in the genome") def __delitem__(self, name): """Remove the contig for the given name @@ -832,7 +832,7 @@ def add(self, contig: Contig): except KeyError: self[contig.name] = contig else: - raise KeyError(f"Contig {contig.name} already in organism {self.name}") + raise KeyError(f"Contig {contig.name} already in genome {self.name}") def get(self, name: str) -> Contig: """ diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index 69557705..c7478cf8 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -86,7 +86,7 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, remove_high_copy_number(pangenome, remove_copy_number) logging.getLogger("PPanGGOLiN").info("Computing the neighbors graph...") - bar = tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="organism", disable=disable_bar) + bar = tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar) for org in bar: bar.set_description(f"Processing {org.name}") bar.refresh() @@ -150,8 +150,8 @@ def parser_graph(parser: argparse.ArgumentParser): required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") 
optional = parser.add_argument_group(title="Optional arguments") optional.add_argument('-r', '--remove_high_copy_number', type=int, default=0, - help="Positive Number: Remove families having a number of copy of gene in a single organism " - "above or equal to this threshold in at least one organism " + help="Positive Number: Remove families having a number of copy of gene in a single genome " + "above or equal to this threshold in at least one genome " "(0 or negative values are ignored).") diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py index 865d8af0..feee71d5 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -51,7 +51,7 @@ def check_pangenome_metadata(pangenome: Pangenome, source: str, metatype: str, f erase_pangenome(pangenome, metadata=True, source=source, metatype=metatype) else: raise Exception( - f"An metadata corresponding to the source : '{source}' already exist in pangenome organims." + f"Metadata corresponding to the source '{source}' already exists in genomes of the pangenome. " 
"Add the option --force to erase") check_pangenome_info(pangenome, disable_bar=disable_bar, **need_dic) diff --git a/ppanggolin/metrics/fluidity.py b/ppanggolin/metrics/fluidity.py index 6c44fcb5..3fd3ce56 100644 --- a/ppanggolin/metrics/fluidity.py +++ b/ppanggolin/metrics/fluidity.py @@ -32,7 +32,7 @@ def gen_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: pangenome.compute_org_bitarrays(part=subset) # Compute binaries corresponding to presence / absence of families in organisms g_sum = 0 - logging.getLogger("PPanGGOLiN").debug("Get number of families in each organisms") + logging.getLogger("PPanGGOLiN").debug("Get number of families in each genomes") org2_nb_fam = nb_fam_per_org(pangenome, disable_bar) logging.getLogger("PPanGGOLiN").info(f"Compute rate of unique family for each genome combination in {subset}") for c_organisms in tqdm(list(combinations(pangenome.organisms, 2)), unit="combination", disable=disable_bar): @@ -54,7 +54,7 @@ def nb_fam_per_org(pangenome: Pangenome, disable_bar: bool = False) -> dict: :return: Dictionary with organisms as key and number of families as value """ org2_nb_fam = dict() - for org in tqdm(pangenome.organisms, unit='organism', disable=disable_bar): + for org in tqdm(pangenome.organisms, unit='genome', disable=disable_bar): org2_nb_fam[org.name] = popcount(org.bitarray) return org2_nb_fam @@ -82,7 +82,7 @@ def fam_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: pangenome.compute_family_bitarrays(part=subset) # Compute binaries corresponding to presence / absence of families in organisms f_sum = 0 - logging.getLogger("PPanGGOLiN").debug("Get number of families in each organisms") + logging.getLogger("PPanGGOLiN").debug("Get number of families in each genome") fam_2_nb_org = nb_org_per_fam(pangenome, disable_bar) logging.getLogger("PPanGGOLiN").info("Compute rate of unique organism for each family combination") for c_fam in tqdm(list(combinations(pangenome.gene_families, 2)), 
unit="combination", disable=disable_bar): diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index df3c8c5b..278951c4 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -177,7 +177,7 @@ def parser_module(parser: argparse.ArgumentParser): optional.add_argument("--size", required=False, type=int, default=3, help="Minimal number of gene family in a module") optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, - help="minimum ratio of organisms in which the family must have multiple genes" + help="minimum ratio of genomes in which the family must have multiple genes" " for it to be considered 'duplicated'") optional.add_argument("-m", "--min_presence", required=False, type=int, default=2, help="Minimum number of times the module needs to be present in the pangenome to be reported." diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index dfa852a4..a89e9a3f 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -168,7 +168,7 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di partitions_list[i] = parti[positions_max_prob.pop()] except IOError: logging.getLogger("PPanGGOLiN").debug( - "partitioning did not work (the number of organisms used is probably too low), " + "partitioning did not work (the number of genomes used is probably too low), " "see logs here to obtain more details " + nem_dir_path.as_posix() + "/nem_file_" + str(kval) + ".log") return {}, None, None # return empty objects @@ -474,8 +474,8 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d tmp_path = Path(tmp_dir.name) if len(organisms) <= 10: - logging.getLogger("PPanGGOLiN").warning(f"The number of selected organisms is too low ({len(organisms)} " - f"organisms used) to robustly partition the graph") + logging.getLogger("PPanGGOLiN").warning(f"The number of selected genomes is too low ({len(organisms)} " + f"genomes used) to 
robustly partition the graph") pangenome.parameters["partition"] = {} pangenome.parameters["partition"]["beta"] = beta @@ -658,9 +658,9 @@ def parser_partition(parser: argparse.ArgumentParser): help="Output directory") optional.add_argument("-fd", "--free_dispersion", required=False, default=False, action="store_true", help="use if the dispersion around the centroid vector of each partition during must be free." - " It will be the same for all organisms by default.") + " It will be the same for all genomes by default.") optional.add_argument("-ck", "--chunk_size", required=False, default=500, type=int, - help="Size of the chunks when performing partitioning using chunks of organisms. " + help="Size of the chunks when performing partitioning using chunks of genomes. " "Chunk partitioning will be used automatically " "if the number of genomes is above this number.") optional.add_argument("-K", "--nb_of_partitions", required=False, default=-1, type=int, diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index af2e33b6..f6e687d8 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -163,7 +163,7 @@ def draw_curve(output: Path, data: list, max_sampling: int = 10): logging.getLogger("PPanGGOLiN").info("Drawing the rarefaction curve ...") raref_name = output/"rarefaction.csv" raref = open(raref_name, "w") - raref.write(",".join(["nb_org", "persistent", "shell", "cloud", "undefined", "exact_core", "exact_accessory", + raref.write(",".join(["genomes_count", "persistent", "shell", "cloud", "undefined", "exact_core", "exact_accessory", "soft_core", "soft_accessory", "pangenome", "K"]) + "\n") for part in data: raref.write(",".join(map(str, [part["nborgs"], part["persistent"], part["shell"], part["cloud"], @@ -185,17 +185,17 @@ def poly_area(p_x: list, p_y: list) -> float: params_file.write("partition,kappa,gamma,kappa_std_error,gamma_std_error,IQR_area\n") for partition in ["persistent", "shell", "cloud", "undefined", 
"exact_core", "exact_accessory", "soft_core", "soft_accessory", "pangenome"]: - percentiles_75 = Series({i: numpy.nanpercentile(data_raref[data_raref["nb_org"] == i][partition], 75) for i in + percentiles_75 = Series({i: numpy.nanpercentile(data_raref[data_raref["genomes_count"] == i][partition], 75) for i in range(1, max_sampling + 1)}).dropna() - percentiles_25 = Series({i: numpy.nanpercentile(data_raref[data_raref["nb_org"] == i][partition], 25) for i in + percentiles_25 = Series({i: numpy.nanpercentile(data_raref[data_raref["genomes_count"] == i][partition], 25) for i in range(1, max_sampling + 1)}).dropna() - mins = Series({i: numpy.min(data_raref[data_raref["nb_org"] == i][partition]) for i in + mins = Series({i: numpy.min(data_raref[data_raref["genomes_count"] == i][partition]) for i in range(1, max_sampling + 1)}).dropna() - maxs = Series({i: numpy.max(data_raref[data_raref["nb_org"] == i][partition]) for i in + maxs = Series({i: numpy.max(data_raref[data_raref["genomes_count"] == i][partition]) for i in range(1, max_sampling + 1)}).dropna() - medians = Series({i: numpy.median(data_raref[data_raref["nb_org"] == i][partition]) for i in + medians = Series({i: numpy.median(data_raref[data_raref["genomes_count"] == i][partition]) for i in range(1, max_sampling + 1)}).dropna() - means = Series({i: numpy.mean(data_raref[data_raref["nb_org"] == i][partition]) for i in + means = Series({i: numpy.mean(data_raref[data_raref["genomes_count"] == i][partition]) for i in range(1, max_sampling + 1)}).dropna() initial_kappa_gamma = numpy.array([0.0, 0.0]) x = percentiles_25.index.tolist() @@ -206,8 +206,8 @@ def poly_area(p_x: list, p_y: list) -> float: "soft_accessory": "#996633", "shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF", "undefined": "#828282"} try: - all_values = data_raref[data_raref["nb_org"] > nb_org_min_fitting][partition].dropna() - res = optimization.curve_fit(heap_law, data_raref.loc[all_values.index]["nb_org"], all_values, + all_values = 
data_raref[data_raref["genomes_count"] > nb_org_min_fitting][partition].dropna() + res = optimization.curve_fit(heap_law, data_raref.loc[all_values.index]["genomes_count"], all_values, initial_kappa_gamma) kappa, gamma = res[0] error_k, error_g = numpy.sqrt(numpy.diag(res[1])) # to calculate the fitting error. @@ -392,7 +392,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No for i in range(min_sampling, max_sampling): # each point for _ in range(depth): # number of samples per points all_samples.append(set(random.sample(set(pangenome.organisms), i + 1))) - logging.getLogger("PPanGGOLiN").info(f"Done sampling organisms in the pan, there are {len(all_samples)} samples") + logging.getLogger("PPanGGOLiN").info(f"Done sampling genomes in the pan, there are {len(all_samples)} samples") samp_nb_per_part = [] logging.getLogger("PPanGGOLiN").info("Computing bitarrays for each family...") diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index feb76ca3..6f073bf8 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -333,7 +333,7 @@ def _mk_contig_getter(self, check_name: bool = False, name: str = ""): if check_name: if contig.name in names: raise KeyError("Two contigs with the same name. 
" - "You should use the contig ID or give the organism name") + "You should use the contig ID or give the genome name") names.add(contig.name) if contig.name == name: identifier = contig.ID @@ -368,13 +368,13 @@ def get_contig(self, identifier: int = None, name: str = None, organism_name: st :raises KeyError: If the `contig` is not in the pangenome """ assert not all(x is None for x in [identifier, name, organism_name]), ("You must provide either contig_id or " - "name or organism_name") + "name or genome_name") if name: if not isinstance(name, str): raise AssertionError("Contig name should be a string") if organism_name: if not isinstance(organism_name, str): - raise AssertionError("Organism name should be a string") + raise AssertionError("Genome name should be a string") organism = self.get_organism(organism_name) return organism.get(name) else: @@ -395,7 +395,7 @@ def get_organism(self, name: str) -> Organism: :raise AssertionError: If the organism name is not a string :raises KeyError: If the provided name is not an organism in the pangenome """ - assert isinstance(name, str), "Organism name should be a string" + assert isinstance(name, str), "Genome name should be a string" try: return self._org_getter[name] except KeyError: @@ -419,8 +419,8 @@ def add_organism(self, organism: Organism): except KeyError: self._org_getter[organism.name] = organism else: - raise KeyError(f"Redondant organism name was found ({organism.name})." - f"All of your organisms must have unique names.") + raise KeyError(f"Redondant genome name was found ({organism.name})." + f"All of your genomes must have unique names.") def get_org_index(self) -> Dict[Organism, int]: # will not make a new index if it exists already """Creates an index for Organisms (each organism is assigned an Integer). 
diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 96038324..a08eb19e 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -107,7 +107,7 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool): if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"]: raise Exception("The provided pangenome has no gene families sequences. " - "This is not possible to annotate an input organism to this pangenome.") + "This is not possible to annotate an input genome to this pangenome.") return predict_rgp, project_spots, project_modules @@ -154,9 +154,9 @@ def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, else: raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) " - "organisms without associated sequence data, and you did not provide " + "genomes without associated sequence data, and you did not provide " "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. " - f"The following organisms have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") + f"The following genomes have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") elif input_type == "fasta": annotate_param_names = ["norna", "kingdom", @@ -453,8 +453,8 @@ def get_gene_sequences_from_fasta_files(organisms, genome_name_to_annot_path): if org_names & set(genome_name_to_annot_path) != org_names: missing = len(org_names - set(genome_name_to_annot_path)) - raise ValueError(f"You did not provided fasta for all the organisms found in annotation file. " - f"{missing} are missing (out of {len(organisms)}). Missing organisms: {','.join(missing)}") + raise ValueError(f"You did not provide fasta for all the genomes found in annotation file. 
" + f"{missing} are missing (out of {len(organisms)}). Missing genomes: {','.join(org_names - set(genome_name_to_annot_path))}") for org in organisms: @@ -467,7 +467,7 @@ def get_gene_sequences_from_fasta_files(organisms, genome_name_to_annot_path): try: contig_seq = org_contig_to_seq[contig.name] except KeyError: - msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} " \ + msg = f"Fasta file for genome {org.name} did not have the contig {contig.name} " \ f"that was read from the annotation file. " msg += f"The provided contigs in the fasta were : " \ f"{', '.join([contig for contig in org_contig_to_seq])}." @@ -491,7 +491,7 @@ def check_input_names(pangenome, input_names): duplicated_names = set(input_names) & {org.name for org in pangenome.organisms} if len(duplicated_names) != 0: raise NameError( - f"{len(duplicated_names)} provided organism names already exist in the given pangenome: {' '.join(duplicated_names)}") + f"{len(duplicated_names)} provided genome name(s) already exist in the given pangenome: {' '.join(duplicated_names)}") def write_summary_in_yaml(summary_info: Dict[str, Any], output_file: Path): @@ -601,9 +601,9 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org # gene id already exists. new_name = f"{input_organism.name}_{gene_id}" logging.getLogger('PPanGGOLiN').warning( - 'The input organism as a specific gene that does not align to any ' + 'The input genome has a specific gene that does not align to any ' f'pangenome families with the same id ({gene_id}) than an existing gene family in the pangenome. 
' - f'The organism name is added to the family name: {new_name}') + f'The genome name is added to the family name: {new_name}') new_gene_family = GeneFamily(pangenome.max_fam_id, new_name) pangenome.add_gene_family(new_gene_family) @@ -673,7 +673,7 @@ def write_predicted_regions(regions: Set[Region], """ fname = output / "plastic_regions.tsv" with write_compressed_or_not(fname, compress) as tab: - fieldnames = ["region", "organism", "contig", "start", + fieldnames = ["region", "genome", "contig", "start", "stop", "genes", "contigBorder", "wholeContig"] writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter='\t') @@ -684,7 +684,7 @@ def write_predicted_regions(regions: Set[Region], for region in regions: row = { "region": region.name, - "organism": region.organism, + "genome": region.organism, "contig": region.contig, "start": region.starter, "stop": region.stopper, @@ -746,7 +746,7 @@ def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): for rna in contig.RNAs: rna.add_dna(get_dna_sequence(contig_id2deq[contig.name], rna)) except KeyError: - msg = f"Fasta file for input_organism {input_organism.name} did not have the contig {contig.name} " \ + msg = f"Fasta file for input genome {input_organism.name} did not have the contig {contig.name} " \ f"that was read from the annotation file. " msg += f"The provided contigs in the fasta were : " \ f"{', '.join([contig for contig in contig_id2deq.keys()])}." @@ -970,7 +970,7 @@ def predict_spot_in_one_organism( input_org_node_to_rgps[border_node].add(rgp) if len(input_org_node_to_rgps) == 0: - logging.getLogger("PPanGGOLiN").debug(f"{organism_name}: no RGPs of the input organism will be associated with any spot of insertion " + logging.getLogger("PPanGGOLiN").debug(f"{organism_name}: no RGPs of the input genome will be associated with any spot of insertion " "as they are on a contig border (or have " f"less than {set_size} persistent gene families until the contig border). 
" "Projection of spots stops here") @@ -984,7 +984,7 @@ def predict_spot_in_one_organism( f"less than {set_size} persistent gene families until the contig border)") logging.getLogger("PPanGGOLiN").debug( - f"{organism_name}: {used} RGPs of the input organism will be associated to a spot of insertion") + f"{organism_name}: {used} RGPs of the input genome will be associated to a spot of insertion") # add potential edges from new nodes to the rest of the nodes all_nodes = list(graph_spot.nodes) @@ -1024,7 +1024,7 @@ def predict_spot_in_one_organism( elif len(spots_of_the_cc) > 1: # more than one spot in the cc - logging.getLogger("PPanGGOLiN").debug(f'{organism_name}: Some RGPs of the input organism ' + logging.getLogger("PPanGGOLiN").debug(f'{organism_name}: Some RGPs of the input genome ' f"are connected to {len(spots_of_the_cc)} original spots of the pangenome.") input_rgps_of_the_cc = set() @@ -1037,7 +1037,7 @@ def predict_spot_in_one_organism( graph_spot.nodes[node]["spot_id"] = ';'.join( (str(spot) for spot in spots_of_the_cc)) - graph_spot.nodes[node]["includes_RGPs_from_the_input_organism"] = True + graph_spot.nodes[node]["includes_RGPs_from_the_input_genome"] = True for spot in spots_of_the_cc: for region in input_rgps_of_the_cc: @@ -1055,7 +1055,7 @@ def predict_spot_in_one_organism( file_basename='projected_spotGraph') write_rgp_to_spot_table(input_rgp_to_spots, output=output, - filename='input_organism_rgp_to_spot.tsv') + filename='input_genome_rgp_to_spot.tsv') input_org_spots = {spot for spots in input_rgp_to_spots.values() for spot in spots } @@ -1083,13 +1083,13 @@ def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Or """ input_orgs_to_modules = {} for input_organism in input_organisms: - output_file = output / input_organism.name / "modules_in_input_organism.tsv" + output_file = output / input_organism.name / "modules_in_input_genome.tsv" input_organism_families = list(input_organism.families) counter = 0 
modules_in_input_org = [] with write_compressed_or_not(output_file, compress) as fout: - fout.write("module_id\torganism\tcompletion\n") + fout.write("module_id\tgenome\tcompletion\n") for mod in pangenome.modules: module_in_input_organism = any( @@ -1231,7 +1231,7 @@ def launch(args: argparse.Namespace): organisms, genome_name_to_path, input_type = manage_input_genomes_annotation(pangenome=pangenome, input_mode=args.input_mode, anno=args.anno, fasta=args.fasta, - organism_name=args.organism_name, + organism_name=args.genome_name, circular_contigs=args.circular_contigs, pangenome_params=pangenome_params, cpu=args.cpu, use_pseudo=args.use_pseudo, @@ -1315,19 +1315,19 @@ def parser_projection(parser: argparse.ArgumentParser): type=Path, help="The pangenome.h5 file") required.add_argument('--fasta', required=False, type=Path, - help="Specify a FASTA file containing the genomic sequences of the organism(s) you wish to annotate, " - "or provide a tab-separated file listing organism names alongside their respective FASTA filepaths, with one line per organism.") + help="Specify a FASTA file containing the genomic sequences of the genome(s) you wish to annotate, " + "or provide a tab-separated file listing genome names alongside their respective FASTA filepaths, with one line per genome.") required.add_argument('--anno', required=False, type=Path, help="Specify an annotation file in GFF/GBFF format for the genome you wish to annotate. " - "Alternatively, you can provide a tab-separated file listing organism names alongside their respective annotation filepaths, " - "with one line per organism. If both an annotation file and a FASTA file are provided, the annotation file will take precedence.") + "Alternatively, you can provide a tab-separated file listing genome names alongside their respective annotation filepaths, " + "with one line per genome. 
If both an annotation file and a FASTA file are provided, the annotation file will take precedence.") required_single = parser.add_argument_group(title="Single Genome Arguments", description="Use these options when providing a single FASTA or annotation file:") - required_single.add_argument("-n", '--organism_name', required=False, type=str, default="input_genome", - help="Specify the name of the organism whose genome you want to annotate when providing a single FASTA or annotation file.") + required_single.add_argument("-n", '--genome_name', required=False, type=str, default="input_genome", + help="Specify the name of the genome whose genome you want to annotate when providing a single FASTA or annotation file.") required_single.add_argument('--circular_contigs', nargs="+", required=False, type=tuple, help="Specify the contigs of the input genome that should be treated as circular when providing a single FASTA or annotation file.") @@ -1358,31 +1358,31 @@ def parser_projection(parser: argparse.ArgumentParser): "(Default behavior is to ignore them)") optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, - help="minimum ratio of organisms in which the family must have multiple genes " + help="minimum ratio of genomes in which the family must have multiple genes " "for it to be considered 'duplicated'. " "This metric is used to compute completeness and duplication of the input genomes") optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, help="Soft core threshold used when generating general statistics on the projected genome. " "This threshold does not influence PPanGGOLiN's partitioning. 
" - "The value determines the minimum fraction of organisms that must possess a gene family " + "The value determines the minimum fraction of genomes that must possess a gene family " "for it to be considered part of the soft core.") optional.add_argument("--spot_graph", required=False, action="store_true", help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs " - "as nodes. This graph can be used to visualize nodes that have RGPs from the input organism.") + "as nodes. This graph can be used to visualize nodes that have RGPs from the input genome.") optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", default=['gexf'], help="Format of the output graph.") optional.add_argument("--gff", required=False, action="store_true", - help="Generate GFF files with projected pangenome annotations for each input organism.") + help="Generate GFF files with projected pangenome annotations for each input genome.") optional.add_argument("--proksee", required=False, action="store_true", - help="Generate JSON map files for PROKSEE with projected pangenome annotations for each input organism.") + help="Generate JSON map files for PROKSEE with projected pangenome annotations for each input genome.") optional.add_argument("--table", required=False, action="store_true", - help="Generate a tsv file for each input organism with pangenome annotations.") + help="Generate a tsv file for each input genome with pangenome annotations.") optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") diff --git a/ppanggolin/region.py b/ppanggolin/region.py index ae38f7c6..f1fede87 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -109,7 +109,7 @@ def __setitem__(self, position: int, gene: Gene): """ if len(self) > 0: if gene.organism != self.organism: - raise Exception(f"Gene {gene.name} is from a different organism than the first defined in RGP. 
" + raise Exception(f"Gene {gene.name} is from a different genome than the first defined in RGP. " "That's not possible") if gene.contig != self.contig: raise Exception(f"Gene {gene.name} is from a different contig than the first defined in RGP. " diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index ad4b8159..13b03bac 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -29,7 +29,7 @@ # all input params that exists in ppanggolin ALL_INPUT_PARAMS = ['fasta', 'anno', 'clusters', 'pangenome', - "fasta_file", "annot_file", "organism_name"] # the last three params is for projection cmd + "fasta_file", "annot_file", "genome_name"] # the last three params is for projection cmd # all params that should be in the general_parameters section of the config file ALL_GENERAL_PARAMS = ['output', 'basename', 'rarefaction', 'no_flat_files', 'tmpdir', 'verbose', 'log', @@ -1012,7 +1012,7 @@ def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, Union[Pa :raises FileNotFoundError: If a specified genome file path does not exist. :raises Exception: If there are no genomes in the provided file. 
""" - logging.getLogger("PPanGGOLiN").info(f"Reading {path_list_file} to process organism files") + logging.getLogger("PPanGGOLiN").info(f"Reading {path_list_file} to process genome files") genome_name_to_genome_path = {} for line in read_compressed_or_not(path_list_file): diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 593c2a4a..54842d24 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -192,7 +192,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, draw_tile_plot(pangenome, args.output, nocloud=nocloud, disable_bar=args.disable_prog_bar) else: logging.getLogger("PPanGGOLiN").warning( - 'Tile plot output have been requested but there are too many organisms to produce a viewable tile plot.') + 'Tile plot output have been requested but there are too many genomes to produce a viewable tile plot.') if args.draw.ucurve: draw_ucurve(pangenome, args.output, disable_bar=args.disable_prog_bar, soft_core=args.draw.soft_core) @@ -300,13 +300,13 @@ def add_workflow_args(parser: argparse.ArgumentParser): required = parser.add_argument_group(title="Input arguments", description="The possible input arguments :") required.add_argument('--fasta', required=False, type=Path, - help="A tab-separated file listing the organism names, " + help="A tab-separated file listing the genome names, " "and the fasta filepath of its genomic sequence(s) (the fastas can be compressed). " - "One line per organism. This option can be used alone.") + "One line per genome. This option can be used alone.") required.add_argument('--anno', required=False, type=Path, - help="A tab-separated file listing the organism names, and the gff filepath of " - "its annotations (the gffs can be compressed). One line per organism. " + help="A tab-separated file listing the genome names, and the gff filepath of " + "its annotations (the gffs can be compressed). One line per genome. 
" "This option can be used alone IF the fasta sequences are in the gff files, " "otherwise --fasta needs to be used.") diff --git a/testingDataset/organisms.fasta.list b/testingDataset/genomes.fasta.list similarity index 100% rename from testingDataset/organisms.fasta.list rename to testingDataset/genomes.fasta.list diff --git a/testingDataset/organisms.gbff.list b/testingDataset/genomes.gbff.list similarity index 100% rename from testingDataset/organisms.gbff.list rename to testingDataset/genomes.gbff.list