Skip to content

Commit

Permalink
Filter out non-ASCII characters when found in the product field
Browse files Browse the repository at this point in the history
  • Loading branch information
JeanMainguy committed Oct 18, 2024
1 parent 561d81b commit 0b6d1df
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 1 deletion.
22 changes: 21 additions & 1 deletion ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
init_contig_counter, contig_counter)
from ppanggolin.pangenome import Pangenome
from ppanggolin.genome import Organism, Gene, RNA, Contig
from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files
from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files, has_non_ascii, replace_non_ascii
from ppanggolin.formats import write_pangenome
from ppanggolin.metadata import Metadata

Expand Down Expand Up @@ -53,6 +53,8 @@ def check_annotate_args(args: argparse.Namespace):
check_input_files(args.anno, True)




def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxrefs: Set[str],
coordinates: List[Tuple[int, int]], strand: str, gene_type: str, position: int = None,
gene_name: str = "", product: str = "", genetic_code: int = 11, protein_id: str = "") -> Gene:
Expand All @@ -74,6 +76,15 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i
:param genetic_code: Genetic code used
:param protein_id: Protein identifier
"""
# Check for non-ASCII characters in the product field
if has_non_ascii(product):

logging.getLogger("PPanGGOLiN").warning(
f"In genome '{org.name}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. "
"These characters cannot be stored in the HDF5 file and will be replaced by underscores."
)
product = replace_non_ascii(product)


start, stop = coordinates[0][0], coordinates[-1][1]

Expand Down Expand Up @@ -889,6 +900,15 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b
is_partial = False

product = attributes.pop('PRODUCT', "")

if has_non_ascii(product):

logging.getLogger("PPanGGOLiN").warning(
f"In genome '{organism}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. "
"These characters cannot be stored in the HDF5 file and will be replaced by underscores."
)
product = replace_non_ascii(product)


if contig is None or contig.name != fields_gff[gff_seqname]:
# get the current contig
Expand Down
25 changes: 25 additions & 0 deletions ppanggolin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1254,3 +1254,28 @@ def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess f
if output is not None:
with open(output, 'w') as fout:
fout.write(result.stdout)



def has_non_ascii(string_to_test: str) -> bool:
    """
    Check if a string contains any non-ASCII characters.

    :param string_to_test: The string to check for non-ASCII characters.

    :return: True if at least one character has a code point above 127,
             False otherwise (including for the empty string).
    """
    # str.isascii() is True exactly when every code point is in 0-127, i.e.
    # when encoding to ASCII would succeed. Asking directly avoids raising and
    # catching UnicodeEncodeError for every string that contains such characters.
    return not string_to_test.isascii()

def replace_non_ascii(string_with_ascii: str, replacement_string: str = "_") -> str:
    """
    Replace non-ASCII characters in a string with a specified replacement string.

    Each run of consecutive non-ASCII characters is replaced by a single copy of
    the replacement string, so two adjacent non-ASCII characters yield one '_'
    with the default replacement.

    :param string_with_ascii: The string potentially containing non-ASCII characters.
    :param replacement_string: Literal text substituted for each run of non-ASCII
                               characters (default is '_').

    :return: A new string where every run of non-ASCII characters has been replaced.
    """
    # A callable replacement makes re.sub insert the text verbatim. Passing the
    # string itself would have it parsed as a substitution template, so a
    # backslash or group reference in it would be misread or raise re.error.
    return re.sub(r'[^\x00-\x7F]+', lambda _match: replacement_string, string_with_ascii)

0 comments on commit 0b6d1df

Please sign in to comment.