Skip to content

Commit

Permalink
Merge branch 'AnnotJoin' into improve_metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
JeanMainguy committed May 27, 2024
2 parents 63183c3 + d39fb84 commit cf5d5ee
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 30 deletions.
2 changes: 1 addition & 1 deletion docs/user/projection.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ Additionally, within the Output directory, there is a subdirectory for each inpu
For Gene Family and Partition of Input Genes:

- `cds_sequences.fasta`: This file contains the sequences of coding regions (CDS) from the input genome.
- `gene_to_gene_family.tsv`: It provides the mapping of genes to gene families of the pangenome. Its format follows [this output](Outputs.md#gene-families-and-genes)
- `gene_to_gene_family.tsv`: It provides the mapping of genes to gene families of the pangenome. Its format follows [this output](PangenomeAnalyses/pangenomeAnalyses.md#gene-families-to-genes-associations)
- `sequences_partition_projection.tsv`: This file maps the input genes to its partition (Persistent, Shell or Cloud).
- `specific_genes.tsv`: This file lists the gene of the input genomes that do not align to any gene of the pangenome. These genes are assigned to Cloud parititon.

Expand Down
21 changes: 12 additions & 9 deletions ppanggolin/align/alignOnPang.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,22 +287,25 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq

def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path:
"""
Write input gene to gene family.
Write input gene to pangenome gene family.
:param seqid_to_gene_family: dictionnary which link sequence and pangenome gene family
:param seqid_to_gene_family: dictionnary which links input sequence and pangenome gene family
:param seq_set: input sequences
:param output: Path of the output directory
:return: Path to file which contain partition projection
:return: Path to the file which contains gene to gene family projection results
"""

gene_fam_map_file = output.absolute() / "gene_to_gene_family.tsv"
with open(gene_fam_map_file, "w") as partProjFile:
for input_seq, gene_fam in seqid_to_gene_family.items():
partProjFile.write(f"{input_seq}\t{gene_fam.name}\n")

for remaining_seq in seq_set - seqid_to_gene_family.keys():
partProjFile.write(f"{remaining_seq}\t{remaining_seq}\n") # if there is no hit, gene family is itself.
with open(gene_fam_map_file, "w") as cluster_proj_file:
for input_seq in seq_set:
# get the seq gene family and if there is no hit, itself
gene_family = seqid_to_gene_family.get(input_seq)
if gene_family is None:
gene_family_name = input_seq
else:
gene_family_name = gene_family.name
cluster_proj_file.write(f"{input_seq}\t{gene_family_name}\n")

return gene_fam_map_file

Expand Down
5 changes: 3 additions & 2 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,8 +419,9 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
contig_len = int(header['LOCUS'].split()[1])

if contig_len != len(sequence):
raise ValueError("The contig length defined in LOCUS section is different than the length of the dna sequence. "
f"For contig {contig_id} in {gbff_file_path}")
logging.getLogger("PPanGGOLiN").warning("Unable to determine if the contig is circular or linear in file "
f"'{gbff_file_path}' from the LOCUS line: {line}. "
"By default, the contig will be considered linear.")

if "CIRCULAR" in header['LOCUS'].upper():
# this line contains linear/circular word telling if the dna sequence is circularized or not
Expand Down
38 changes: 23 additions & 15 deletions ppanggolin/formats/writeMSA.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,25 +135,26 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory
# have a directory for each gene family, to make deletion of tmp files simpler

f_name = Path(tmpdir.name) / f"{family.name}.fasta"
f_obj = open(f_name, "w")

# get genes that are present in only one copy for our family in each organism.
single_copy_genes = []
for _, genes in family.get_org_dict().items():
if len(genes) == 1:
single_copy_genes.extend(genes)

for gene in single_copy_genes:
if use_gene_id:
f_obj.write('>' + gene.ID + "\n")
else:
f_obj.write('>' + gene.organism.name + "\n")
if source == "dna":
f_obj.write(gene.dna + '\n')
elif source == "protein":
f_obj.write(translate(gene.dna, code_table) + "\n")
else:
raise AssertionError("Unknown sequence source given (expected 'dna' or 'protein')")
f_obj.flush()

with open(f_name, "w") as f_obj:

for gene in single_copy_genes:
if use_gene_id:
f_obj.write(f">{gene.ID}\n")
else:
f_obj.write(f">{gene.organism.name}\n")
if source == "dna":
f_obj.write(gene.dna + '\n')
elif source == "protein":
f_obj.write(translate(gene.dna, code_table) + "\n")
else:
raise ValueError(f"Unknown sequence source '{source}' provided. Expected 'dna' or 'protein'.")

return f_name

Expand All @@ -169,7 +170,14 @@ def launch_mafft(fname: Path, output: Path, fam_name: str):
outname = output / f"{fam_name}.aln"
cmd = ["mafft", "--thread", "1", fname.absolute().as_posix()]
logging.getLogger("PPanGGOLiN").debug("command: " + " ".join(cmd))
subprocess.run(cmd, stdout=open(outname, "w"), stderr=subprocess.DEVNULL, check=True)
result = subprocess.run(cmd, capture_output=True)

with open(outname, "w") as fout:
fout.write(result.stdout.decode('utf-8'))

if result.stderr and result.returncode != 0:
raise Exception("mafft failed with the following error:\n"
f"{result.stderr.decode('utf-8')}")


def launch_multi_mafft(args: List[Tuple[Path, Path, str]]):
Expand Down
6 changes: 3 additions & 3 deletions ppanggolin/region.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,10 +393,10 @@ def is_contig_border(self) -> bool:
assert len(self) > 0, "Your region has no genes. Something wrong happened."

if not self.contig.is_circular:
min_pos = min(self.contig.genes, key=lambda x: x.position).position
max_pos = max(self.contig.genes, key=lambda x: x.position).position
first_gene = self.contig[0]
last_gene = self.contig[-1]

if self.starter.position == min_pos or self.stopper.position == max_pos:
if self.starter == first_gene or self.stopper == last_gene:
return True
return False

Expand Down

0 comments on commit cf5d5ee

Please sign in to comment.