Skip to content

Commit

Permalink
replace organism by genome
Browse files Browse the repository at this point in the history
  • Loading branch information
JeanMainguy committed Dec 4, 2023
1 parent 389926e commit c19a600
Show file tree
Hide file tree
Showing 36 changed files with 180 additions and 179 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/check_recipes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,5 @@ jobs:
shell: bash -l {0}
run: |
cd testingDataset/
ppanggolin all --cpu 1 --anno organisms.gbff.list -o pango
ppanggolin all --cpu 1 --anno genomes.gbff.list -o pango
ppanggolin info -p pango/pangenome.h5 --content --parameters --status
38 changes: 19 additions & 19 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin all --cpu 1 --fasta organisms.fasta.list --output mybasicpangenome
ppanggolin all --cpu 1 --fasta genomes.fasta.list --output mybasicpangenome
ppanggolin info --pangenome mybasicpangenome/pangenome.h5 --content --parameters --status
cd -
# test most options calls. If there is a change in the API somewhere that was not taken into account (whether in the options for the users, or the classes for the devs), this should fail, otherwise everything is probably good.
Expand All @@ -59,7 +59,7 @@ jobs:
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin annotate --fasta organisms.fasta.list --output stepbystep --kingdom bacteria
ppanggolin annotate --fasta genomes.fasta.list --output stepbystep --kingdom bacteria
ppanggolin cluster -p stepbystep/pangenome.h5 --coverage 0.8 --identity 0.8
ppanggolin graph -p stepbystep/pangenome.h5 -r 10
ppanggolin partition --output stepbystep -f -p stepbystep/pangenome.h5 --cpu 1 -b 2.6 -ms 10 -fd -ck 500 -Kmm 3 12 -im 0.04 --draw_ICL -se $RANDOM
Expand All @@ -70,24 +70,24 @@ jobs:
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --borders --families_tsv --cpu 1
ppanggolin write_genomes -p stepbystep/pangenome.h5 --output stepbystep -f --fasta organisms.fasta.list --gff --proksee --table
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list
ppanggolin write_genomes -p stepbystep/pangenome.h5 --output stepbystep -f --fasta genomes.fasta.list --gff --proksee --table
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta genomes.fasta.list
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log
cd -
- name: gbff parsing and MSA computing
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin workflow --cpu 1 --anno organisms.gbff.list --output myannopang
ppanggolin workflow --cpu 1 --anno genomes.gbff.list --output myannopang
ppanggolin msa --pangenome myannopang/pangenome.h5 --source dna --partition core -o myannopang/ -f --use_gene_id --phylo --single_copy
cd -
- name: clusters reading from external file
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv --output readclusterpang
ppanggolin annotate --anno organisms.gbff.list --output readclusters
ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang
ppanggolin annotate --anno genomes.gbff.list --output readclusters
ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5
ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f
cd -
Expand Down Expand Up @@ -137,40 +137,40 @@ jobs:
run: |
cd testingDataset
ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml
ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml
ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml
cd -
- name: testing projection cmd
shell: bash -l {0}
run: |
cd testingDataset
head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list
ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno organisms.gbff.head.list --gff --proksee
head genomes.gbff.list | sed 's/^/input_genome_/g' > genomes.gbff.head.list
ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno genomes.gbff.head.list --gff --proksee
ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \
--organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \
--genome_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \
--spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee --table --add_metadata
ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_gff_prodigal \
--organism_name chlam_annotated_with_prodigal --anno GBFF/GCF_003788785.1_ct114V1_genomic_prodigal_annotation.gff.gz \
--genome_name chlam_annotated_with_prodigal --anno GBFF/GCF_003788785.1_ct114V1_genomic_prodigal_annotation.gff.gz \
--gff --table
- name: testing write_genome_cmds
shell: bash -l {0}
run: |
cd testingDataset
head organisms.gbff.list | cut -f1 > organisms_names.gbff.head.list
head genomes.gbff.list | cut -f1 > genome_names.gbff.head.list
ppanggolin write_genomes -p myannopang/pangenome.h5 --output flat_genomes_from_file_org -f \
--anno organisms.gbff.list --gff --table --organisms organisms_names.gbff.head.list
ppanggolin write_genomes -p myannopang/pangenome.h5 --output flat_genomes_from_genome_files -f \
--anno genomes.gbff.list --gff --table --genomes genome_names.gbff.head.list
ppanggolin write_genomes -p stepbystep/pangenome.h5 --output flat_genomes_from_cmdline_orgs --proksee \
--organisms GCF_006508185.1_ASM650818v1_genomic,GCF_002088315.1_ASM208831v1_genomic
ppanggolin write_genomes -p stepbystep/pangenome.h5 --output flat_genomes_from_cmdline_genomes --proksee \
--genomes GCF_006508185.1_ASM650818v1_genomic,GCF_002088315.1_ASM208831v1_genomic
head organisms.fasta.list | cut -f1 > organisms_names.fasta.head.list
head genomes.fasta.list | cut -f1 > genome_names.fasta.head.list
# Default separator is a pipe but a pipe is found in a value of metadata db1. That is why we use another separator here.
ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs \
--organisms organisms_names.fasta.head.list \
--genomes genome_names.fasta.head.list \
-f --gff --add_metadata --table --metadata_sep §
# The pipe separator is found in metadata source db1. If we don't require this source, then writing with the pipe separator works fine.
Expand Down
4 changes: 2 additions & 2 deletions docs/user/Modules/moduleOutputs.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
## Module outputs

### Functional modules
This .tsv file lists the modules and the gene families that belong to them. It lists one family per line, and there are multiple line for each module.
This `.tsv` file lists the modules and the gene families that belong to them. It lists one family per line, and there are multiple lines for each module.
It is written along with other files with the following command:
`ppanggolin write_pangenome -p pangenome.h5 --modules`

Expand Down Expand Up @@ -54,7 +54,7 @@ The format of the 'modules_spots.tsv' file is the following:
|module_id| The module identifier|
|spot_id| the spot identifier|

The file 'modules_RGP_lists.tsv' lists RGPs that have the same modules. Those RGPs can have different gene families, however they will not have any other module than those that are indicated. The format of the 'modules_RGP_lists.tsv' is the following:
The file `modules_RGP_lists.tsv` lists RGPs that have the same modules. Those RGPs can have different gene families; however, they will not have any other module than those that are indicated. The format of the `modules_RGP_lists.tsv` file is the following:

|column|description|
|------|------------|
Expand Down
7 changes: 4 additions & 3 deletions docs/user/RGP/rgpOutputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ The file has the following format :
| column | description |
|--------|-------------|
| region | a unique identifier for the region. This is usually built from the contig it is on, with a number after it|
|organism| the organism it is in. This is the organism name provided by the user.|
|genome| the genome it is in. This is the genome name provided by the user.|
|start| the start position of the RGP in the contig|
|stop| the stop position of the RGP in the contig|
|genes| the number of genes included in the RGP|
Expand Down Expand Up @@ -72,7 +72,7 @@ For versions 1.2.30 and above, the 'draw' command can draw specific spots of int
It will also write a gexf file, which corresponds to the gene families and their organization within the spots. It is basically a subgraph of the pangenome, consisting of the spot itself.
The command can be used as such:

`ppanggolin draw -p pangenome.h5 --spots all` will draw an interactive .html figure and a gexf file for all the spots.
`ppanggolin draw -p pangenome.h5 --spots all` will draw an interactive `.html` figure and a `gexf` graph file for all the spots.

If you are interested in only a single spot, you can use its identifier to draw it, as such:

Expand All @@ -86,7 +86,8 @@ The interactive figures that are drawn look like this:



The plot represents the different gene organizations that are found in the spot. If there are RGPs with identical gene organization, the organization is represented only once (the represented RGP is picked at random among all identical RGPs). The list of RGPs with the same organization is accessible in the file written alongside the figure called 'spot_X_identical_rgps.tsv', with X the spot_id.
The plot represents the different gene organizations that are found in the spot. If there are RGPs with identical gene organization, the organization is represented only once (the represented RGP is picked at random among all identical RGPs). The list of RGPs with the same organization is accessible in the file written alongside the figure called `spot_X_identical_rgps.tsv`, with X the spot_id.

They can be edited using the sliders and the radio buttons, to change various graphical parameters, and then the plot itself can be saved using the save button on the right of the screen, if need be.

For the gexf file, you can see how to visualize it in the section about the [pangenome gexf](../PangenomeAnalyses/pangenomeGraphOut.md#pangenome-graph-output).
14 changes: 7 additions & 7 deletions docs/user/genomicContext.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,22 @@ The second possibility is to give a list of gene families ID used to compute the

This will search the common connected components in the computed pangenome and export the result in a tsv file.

In this case, you can give a pangenome without gene families representatives sequences. This option is compatible with a pangenome computed with an external clustering (see the [cluster](PangenomeAnalyses/pangenomeBuild.md#clustering) subcommand).
In this case, you can give a pangenome without gene family representative sequences. This option is compatible with a pangenome computed with an external clustering (see the [cluster](./PangenomeAnalyses/pangenomeCluster.md) subcommand).

## Output format

If you are using family IDs, the only output is the `gene_context.tsv` file. Otherwise, if you use sequences, you will have another output file reporting the alignment between the sequences and the pangenome families (see details in the align subcommand).

There are 6 columns in `gene_context.tsv`.

1. **geneContext ID**: Identifier of the found context. It is incrementally generated, beginning with 1
2. **Gene family name**: Identifier of the gene family, from the pangenome, correspond to the found context
3. **Sequence ID**: Identifier of the searched sequence in the pangenome
4. **Nb Genomes**: Number of genomes where the genomic context is found
1. **geneContext_ID**: Identifier of the found context. It is incrementally generated, beginning with 1
2. **Gene_family_name**: Identifier of the gene family, from the pangenome, corresponding to the found context
3. **Sequence_ID**: Identifier of the searched sequence in the pangenome
4. **Nb_Genomes**: Number of genomes where the genomic context is found
5. **Partition**: Partition of the gene family corresponding to the found context
6. **Target family**: Whether the family is a target family, meaning it matches an input sequence, or a family provided as input.
6. **Target_family**: Whether the family is a target family, meaning it matches an input sequence, or a family provided as input.

In **sequence Id**, it is possible to find a NA value. This case, correspond to another gene family found in the context.
In **Sequence_ID**, it is possible to find an NA value. This case corresponds to another gene family found in the context.

## Detailed options

Expand Down
2 changes: 1 addition & 1 deletion ppanggolin/RGP/genomicIsland.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def parser_rgp(parser: argparse.ArgumentParser):
optional.add_argument('--min_length', required=False, type=int, default=3000,
help="Minimum length (bp) of a region to be considered a RGP")
optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05,
help="Minimum ratio of organisms where the family is present in which the family must "
help="Minimum ratio of genomes where the family is present in which the family must "
"have multiple genes for it to be considered 'duplicated'")


Expand Down
2 changes: 1 addition & 1 deletion ppanggolin/RGP/rgp_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def add_info_to_rgp_nodes(graph, regions: List[Region], region_to_spot: dict):
region_attributes = {}
for region in regions:
region_info = {"contig": region.contig.name,
'organism': region.organism.name,
'genome': region.organism.name,
"name": region.name,
"genes_count": len(region),
"is_contig_border": region.is_contig_border,
Expand Down
2 changes: 1 addition & 1 deletion ppanggolin/RGP/spot.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def write_spot_graph(graph_spot, outdir, graph_formats, file_basename="spotGraph
graph_spot.nodes[node]["border0"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border0"]])
graph_spot.nodes[node]["border1"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border1"]])

graph_spot.nodes[node]["organisms"] = ';'.join({rgp.organism.name for rgp in graph_spot.nodes[node]["rgp"]})
graph_spot.nodes[node]["genomes"] = ';'.join({rgp.organism.name for rgp in graph_spot.nodes[node]["rgp"]})
graph_spot.nodes[node]["rgp"] = ';'.join([rgp.name for rgp in graph_spot.nodes[node]["rgp"]])

if "gexf" in graph_formats:
Expand Down
18 changes: 9 additions & 9 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,7 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p
:param disable_bar: Disable the progress bar
"""

logging.getLogger("PPanGGOLiN").info(f"Reading {organisms_file.name} the list of organism files ...")
logging.getLogger("PPanGGOLiN").info(f"Reading {organisms_file.name} the list of genome files ...")

pangenome.status["geneSequences"] = "Computed"
# we assume there are gene sequences in the annotation files,
Expand Down Expand Up @@ -551,7 +551,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: Path):
for line in read_compressed_or_not(fasta_files):
elements = [el.strip() for el in line.split("\t")]
if len(elements) <= 1:
logging.getLogger("PPanGGOLiN").error("No tabulation separator found in organisms file")
logging.getLogger("PPanGGOLiN").error("No tabulation separator found in genome file")
exit(1)
try:
org = pangenome.get_organism(elements[0])
Expand All @@ -563,7 +563,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: Path):
fasta_dict[org] = read_fasta(org, currFastaFile)
if set(pangenome.organisms) > set(fasta_dict.keys()):
missing = pangenome.number_of_organisms - len(set(pangenome.organisms) & set(fasta_dict.keys()))
raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. "
raise Exception(f"Not all of your pangenome genomes are present within the provided fasta file. "
f"{missing} are missing (out of {pangenome.number_of_organisms}).")

for org in pangenome.organisms:
Expand All @@ -574,7 +574,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: Path):
for rna in contig.RNAs:
rna.add_sequence(get_dna_sequence(fasta_dict[org][contig.name], rna))
except KeyError:
msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} " \
msg = f"Fasta file for genome {org.name} did not have the contig {contig.name} " \
f"that was read from the annotation file. "
msg += f"The provided contigs in the fasta were : " \
f"{', '.join([contig for contig in fasta_dict[org].keys()])}."
Expand All @@ -600,7 +600,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu:
:param disable_bar: Disable the progress bar
"""

logging.getLogger("PPanGGOLiN").info(f"Reading {fasta_list} the list of organism files")
logging.getLogger("PPanGGOLiN").info(f"Reading {fasta_list} the list of genome files")

arguments = [] # Argument given to annotate organism in same order than prototype
for line in read_compressed_or_not(fasta_list):
Expand Down Expand Up @@ -699,11 +699,11 @@ def parser_annot(parser: argparse.ArgumentParser):
required = parser.add_argument_group(title="Required arguments",
description="One of the following arguments is required :")
required.add_argument('--fasta', required=False, type=Path,
help="A tab-separated file listing the organism names, and the fasta filepath of its genomic "
"sequence(s) (the fastas can be compressed with gzip). One line per organism.")
help="A tab-separated file listing the genome names, and the fasta filepath of its genomic "
"sequence(s) (the fastas can be compressed with gzip). One line per genome.")
required.add_argument('--anno', required=False, type=Path,
help="A tab-separated file listing the organism names, and the gff/gbff filepath of its "
"annotations (the files can be compressed with gzip). One line per organism. "
help="A tab-separated file listing the genome names, and the gff/gbff filepath of its "
"annotations (the files can be compressed with gzip). One line per genome. "
"If this is provided, those annotations will be used.")

optional = parser.add_argument_group(title="Optional arguments")
Expand Down
Loading

0 comments on commit c19a600

Please sign in to comment.