Skip to content

Commit

Permalink
add metadata in table output and in projection cmd
Browse files Browse the repository at this point in the history
  • Loading branch information
JeanMainguy committed Nov 15, 2023
1 parent 3e2e262 commit a60fbcf
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 42 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ jobs:
ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \
--organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \
--spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee
--spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee --table --add_metadata
- name: testing write_genome_cmds
Expand All @@ -157,7 +157,7 @@ jobs:
head organisms.gbff.list | cut -f1 > organisms_names.gbff.head.list
ppanggolin write_genomes -p myannopang/pangenome.h5 --output flat_genomes_from_file_org -f \
--anno organisms.gbff.list --gff --organisms organisms_names.gbff.head.list
--anno organisms.gbff.list --gff --table --organisms organisms_names.gbff.head.list
ppanggolin write_genomes -p stepbystep/pangenome.h5 --output flat_genomes_from_cmdline_orgs --proksee \
--organisms GCF_006508185.1_ASM650818v1_genomic,GCF_002088315.1_ASM208831v1_genomic
Expand All @@ -166,9 +166,9 @@ jobs:
# Default separator is a pipe but a pipe is found in a value of metadata db1. That is why we use another separator here.
ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs \
--organisms organisms_names.fasta.head.list \
-f --gff --add_metadata --metadata_sep §
-f --gff --add_metadata --table --metadata_sep §
# Pipe separatore is found in metadata source db1. if we don't require this source then the writting with pipe is work fine.
ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs_with_metadata -f --gff --proksee --add_metadata --metadata_sources db2 db3 db4
ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs_with_metadata -f --gff --proksee --table --add_metadata --metadata_sources db2 db3 db4
80 changes: 46 additions & 34 deletions ppanggolin/formats/writeFlatGenomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,50 +51,64 @@ def count_neighbors_partitions(gene_family: GeneFamily):
return nb_pers, nb_shell, nb_cloud


def write_tsv_genome_file(organism: Organism, output: Path, compress: bool = False,
def write_tsv_genome_file(organism: Organism, output: Path, compress: bool = False, metadata_sep:str = "|",
need_regions: bool = False, need_spots: bool = False, need_modules: bool = False):
"""
Write the table of pangenome for one organism in tsv format
Write the table of genes with pangenome annotation for one organism in tsv
:param organism: Projected organism
:param organism: An organism
:param output: Path to output directory
:param compress: Compress the file in .gz
:param need_regions: Write information about regions
:param need_spots: Write information about spots
:param need_modules: Write information about modules
:return:
"""

rows = defaultdict(list)
rows = []
for gene in organism.genes:
nb_pers, nb_shell, nb_cloud = count_neighbors_partitions(gene.family)

rows["gene"].append(gene.ID if gene.local_identifier == "" else gene.local_identifier)
rows["contig"].append(gene.contig.name)
rows["start"].append(gene.start)
rows["stop"].append(gene.stop)
rows["strand"].append(gene.strand)
rows["family"].append(gene.family.name)
rows["nb_copy_in_org"].append(len(list(gene.family.get_genes_per_org(organism))))
rows["partition"].append(gene.family.named_partition)
rows["persistent_neighbors"].append(nb_pers)
rows["shell_neighbors"].append(nb_shell)
rows["cloud_neighbors"].append(nb_cloud)
rows["RGPs"].append(str(gene.RGP) if gene.RGP is not None else None)
rows['Spots'].append(str(gene.spot) if gene.spot is not None else None)
modules = None
if gene.family.number_of_modules > 0:
modules = ','.join([str(module) for module in gene.modules])
rows['Modules'].append(modules)

if not need_regions:
rows.pop("RGPs")
if not need_spots:
rows.pop("Spots")
if not need_modules:
rows.pop("Modules")

pd.DataFrame.from_dict(rows).to_csv(output / f"{organism.name}.tsv{'.gz' if compress else ''}", sep="\t", index=False)
gene_info = {}

gene_info["gene"] = gene.ID if gene.local_identifier == "" else gene.local_identifier
gene_info["contig"] = gene.contig.name
gene_info["start"] = gene.start
gene_info["stop"] = gene.stop
gene_info["strand"] = gene.strand
gene_info["family"] = gene.family.name
gene_info["nb_copy_in_org"] = len(list(gene.family.get_genes_per_org(organism)))
gene_info["partition"] = gene.family.named_partition
gene_info["persistent_neighbors"] = nb_pers
gene_info["shell_neighbors"] = nb_shell
gene_info["cloud_neighbors"] = nb_cloud

if need_regions:
gene_info["RGP"] = str(gene.RGP) if gene.RGP is not None else None
if need_spots:
gene_info['Spot'] = str(gene.spot) if gene.spot is not None else None
if need_modules:
# TODO: Check coherency..
# gene and family should not have more than one module.
# Why is it different here?
modules = None
if gene.family.number_of_modules > 0:
modules = ','.join([str(module) for module in gene.modules])
gene_info['Module'] = modules

# Add metadata
gene_metadata = {f"gene_{key}":value for key, value in gene.formatted_metadata_dict(metadata_sep).items()}
gene_info.update(gene_metadata)

family_metadata = {f"family_{key}":value for key, value in gene.family.formatted_metadata_dict(metadata_sep).items()}
gene_info.update(family_metadata)
if need_regions and gene.RGP:
rgp_metadata = {f"rgp_{key}":value for key, value in gene.RGP.formatted_metadata_dict(metadata_sep).items()}
gene_info.update(rgp_metadata)

rows.append(gene_info)

pd.DataFrame(rows).to_csv(output / f"{organism.name}.tsv{'.gz' if compress else ''}", sep="\t", index=False)

logging.getLogger("PPangGGOLiN").debug(f"Done writing the table with pangenome annotation for {organism.name}")

Expand Down Expand Up @@ -385,7 +399,6 @@ def get_organism_list(organisms_filt: str, pangenome: Pangenome) -> Set[Organism
"to determine which genomes should be included in the output.")
with open(organisms_filt) as fl:
org_names = [line.strip() for line in fl if line and not line.startswith("#")]
print(org_names)
else:
org_names = [name.strip() for name in organisms_filt.split(',') if name.strip()]

Expand Down Expand Up @@ -444,7 +457,7 @@ def mp_write_genomes_file(organism: Organism, output: Path, organisms_file: Path
if table:
write_tsv_genome_file(organism=organism, output=org_outdir, **{arg: kwargs[arg] for arg in kwargs.keys() &
{'need_regions', 'need_modules', 'need_spots',
'compress'}})
'compress', 'metadata_sep'}})

return organism.name

Expand Down Expand Up @@ -490,7 +503,6 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa
# Place here to raise an error if file doesn't found before to read pangenome
organisms_file = fasta if fasta is not None else anno

# TODO: see if we export metadata in write_genomes command
check_pangenome_info(pangenome, disable_bar=disable_bar, **need_dict)

organisms_list = get_organism_list(organisms_filt, pangenome)
Expand Down
30 changes: 26 additions & 4 deletions ppanggolin/projection/projection.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from ppanggolin.pangenome import Pangenome
from ppanggolin.utils import detect_filetype, create_tmpdir, read_compressed_or_not, write_compressed_or_not, \
restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, \
check_input_files, parse_input_paths_file, flatten_nested_dict
check_input_files, parse_input_paths_file
from ppanggolin.align.alignOnPang import write_gene_to_gene_family, get_input_seq_to_family_with_rep, \
get_input_seq_to_family_with_all, project_and_write_partition
from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations
Expand Down Expand Up @@ -104,9 +104,13 @@ def launch(args: argparse.Namespace):
raise Exception("The provided pangenome has no gene families sequences. "
"This is not possible to annotate an input organism to this pangenome.")

# Projected genomes have no metadata so the only element metadata that can be used in projected genomes is modules and families
metadata_types = ['modules', "families"]

check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar,
need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False,
need_spots=project_spots, need_metadata=True, metatypes=['families'])
need_spots=project_spots, need_metadata=args.add_metadata, sources=args.metadata_sources,
metatypes=metadata_types)

logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.')
pangenome_params = argparse.Namespace(
Expand Down Expand Up @@ -272,11 +276,11 @@ def launch(args: argparse.Namespace):

write_gff_file(organism, output_dir / organism.name,
annotation_sources=annotation_sources,
genome_sequences=genome_sequences,
genome_sequences=genome_sequences, metadata_sep=args.metadata_sep,
compress=args.compress)

if args.table:
write_tsv_genome_file(organism, output_dir / organism.name, compress=args.compress,
write_tsv_genome_file(organism, output_dir / organism.name, compress=args.compress, metadata_sep=args.metadata_sep,
need_regions=predict_rgp, need_spots=project_spots, need_modules=project_modules)

output_file = output_dir / "summary_projection.tsv",
Expand Down Expand Up @@ -1309,3 +1313,21 @@ def parser_projection(parser: argparse.ArgumentParser):

optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
help="Keeping temporary files (useful for debugging).")

optional.add_argument("--add_metadata",
required=False,
action="store_true",
help="Include metadata information in the output files "
"if any have been added to pangenome elements (see ppanggolin metadata command).")

optional.add_argument("--metadata_sources",
default=None,
nargs="+",
help="Which source of metadata should be written. "
"By default all metadata sources are included.")

optional.add_argument("--metadata_sep",
required=False,
default='|',
help="The separator used to join multiple metadata values for elements with multiple metadata"
" values from the same source. This character should not appear in metadata values.")

0 comments on commit a60fbcf

Please sign in to comment.