Skip to content

Commit

Permalink
Merge pull request #291 from labgem/filter_non_ascii_char
Browse files Browse the repository at this point in the history
Filter non ASCII character
  • Loading branch information
jpjarnoux authored Oct 21, 2024
2 parents 561d81b + fbe6132 commit 1fd1092
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 3 deletions.
22 changes: 21 additions & 1 deletion ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
init_contig_counter, contig_counter)
from ppanggolin.pangenome import Pangenome
from ppanggolin.genome import Organism, Gene, RNA, Contig
from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files
from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files, has_non_ascii, replace_non_ascii
from ppanggolin.formats import write_pangenome
from ppanggolin.metadata import Metadata

Expand Down Expand Up @@ -53,6 +53,8 @@ def check_annotate_args(args: argparse.Namespace):
check_input_files(args.anno, True)




def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxrefs: Set[str],
coordinates: List[Tuple[int, int]], strand: str, gene_type: str, position: int = None,
gene_name: str = "", product: str = "", genetic_code: int = 11, protein_id: str = "") -> Gene:
Expand All @@ -74,6 +76,15 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i
:param genetic_code: Genetic code used
:param protein_id: Protein identifier
"""
# Check for non-ASCII characters in the product field
if has_non_ascii(product):

logging.getLogger("PPanGGOLiN").warning(
f"In genome '{org.name}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. "
"These characters cannot be stored in the HDF5 file and will be replaced by underscores."
)
product = replace_non_ascii(product)


start, stop = coordinates[0][0], coordinates[-1][1]

Expand Down Expand Up @@ -889,6 +900,15 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b
is_partial = False

product = attributes.pop('PRODUCT', "")

if has_non_ascii(product):

logging.getLogger("PPanGGOLiN").warning(
f"In genome '{organism}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. "
"These characters cannot be stored in the HDF5 file and will be replaced by underscores."
)
product = replace_non_ascii(product)


if contig is None or contig.name != fields_gff[gff_seqname]:
# get the current contig
Expand Down
25 changes: 25 additions & 0 deletions ppanggolin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1254,3 +1254,28 @@ def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess f
if output is not None:
with open(output, 'w') as fout:
fout.write(result.stdout)



def has_non_ascii(string_to_test: str) -> bool:
    """
    Tell whether a string holds at least one character outside the ASCII range.

    :param string_to_test: The string to inspect.

    :return: True if any character is non-ASCII, False otherwise (an empty string yields False).
    """
    return not string_to_test.isascii()

def replace_non_ascii(string_with_ascii: str, replacement_string: str = "_") -> str:
    """
    Replace non-ASCII characters in a string with a specified replacement string.

    Each run of consecutive non-ASCII characters is replaced by a single copy of
    the replacement string. The replacement is inserted literally: backslashes
    in it are not interpreted as regex escapes or group references.

    :param string_with_ascii: The string potentially containing non-ASCII characters.
    :param replacement_string: The string to replace non-ASCII characters with (default is '_').

    :return: A new string where all non-ASCII characters have been replaced.
    """
    # A callable replacement is used instead of passing the string directly:
    # `re.sub` would otherwise process it as a template, so a value such as
    # "\\d" or "\\1" would raise `re.error` instead of being inserted as-is.
    return re.sub(r'[^\x00-\x7F]+', lambda _match: replacement_string, string_with_ascii)
27 changes: 25 additions & 2 deletions tests/utils/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
import zipfile
from typing import Generator

from ppanggolin.utils import is_compressed, read_compressed_or_not, write_compressed_or_not

from ppanggolin.utils import is_compressed, read_compressed_or_not, write_compressed_or_not, has_non_ascii, replace_non_ascii

class TestCompressed:
"""
Expand Down Expand Up @@ -157,3 +156,27 @@ def test_write_uncompressed(self, plain_file_path: Path) -> None:
f.write("Test data")
with open(plain_file_path, 'r') as f:
assert f.read() == "Test data"


# Parametrized checks for has_non_ascii
@pytest.mark.parametrize(
    ("input_string", "expected"),
    [
        ("Escherichia_coli", False),  # Only ASCII characters
        ("Escherichia_colí", True),  # Trailing 'í' is non-ASCII
        ("simple_string", False),  # Plain ASCII string
        ("Ωmega", True),  # Leading 'Ω' is non-ASCII
        ("", False),  # An empty string has no non-ASCII character
    ],
)
def test_has_non_ascii(input_string, expected):
    """Check that has_non_ascii returns the expected flag for each sample string."""
    detected = has_non_ascii(input_string)
    assert detected == expected

# Parametrized checks for replace_non_ascii
@pytest.mark.parametrize(
    ("input_string", "replacement", "expected"),
    [
        ("Escherichia_coli", "_", "Escherichia_coli"),  # ASCII only: returned unchanged
        ("Escherichia_colí", "_", "Escherichia_col_"),  # 'í' becomes '_'
        ("Ωmega", "-", "-mega"),  # 'Ω' becomes '-'
        ("Escherichia_Ωcoli", "X", "Escherichia_Xcoli"),  # 'Ω' becomes 'X'
        ("", "_", ""),  # Empty input stays empty
    ],
)
def test_replace_non_ascii(input_string, replacement, expected):
    """Check that replace_non_ascii returns the expected string for each sample input."""
    cleaned = replace_non_ascii(input_string, replacement)
    assert cleaned == expected

0 comments on commit 1fd1092

Please sign in to comment.