Merge pull request #291 from labgem/filter_non_ascii_char

Filter non ASCII character
labgem · Oct 21, 2024 · 1fd1092 · 1fd1092
2 parents 561d81b + fbe6132
commit 1fd1092
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 3 deletions.
diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
@@ -24,7 +24,7 @@
                                        init_contig_counter, contig_counter)
 from ppanggolin.pangenome import Pangenome
 from ppanggolin.genome import Organism, Gene, RNA, Contig
-from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files
+from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files, has_non_ascii, replace_non_ascii
 from ppanggolin.formats import write_pangenome
 from ppanggolin.metadata import Metadata
 
@@ -53,6 +53,8 @@ def check_annotate_args(args: argparse.Namespace):
         check_input_files(args.anno, True)
 
 
+
+
 def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxrefs: Set[str],
                 coordinates: List[Tuple[int, int]], strand: str, gene_type: str, position: int = None,
                 gene_name: str = "", product: str = "", genetic_code: int = 11, protein_id: str = "") -> Gene:
@@ -74,6 +76,15 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i
     :param genetic_code: Genetic code used
     :param protein_id: Protein identifier
     """
+    # Check for non-ASCII characters in the product field
+    if has_non_ascii(product):
+
+        logging.getLogger("PPanGGOLiN").warning(
+                f"In genome '{org.name}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. "
+                "These characters cannot be stored in the HDF5 file and will be replaced by underscores."
+            )
+        product = replace_non_ascii(product)
+
 
     start, stop = coordinates[0][0], coordinates[-1][1]
 
@@ -889,6 +900,15 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b
                         is_partial = False
 
                     product = attributes.pop('PRODUCT', "")
+
+                    if has_non_ascii(product):
+
+                        logging.getLogger("PPanGGOLiN").warning(
+                                f"In genome '{organism}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. "
+                                "These characters cannot be stored in the HDF5 file and will be replaced by underscores."
+                            )
+                        product = replace_non_ascii(product)
+
 
                     if contig is None or contig.name != fields_gff[gff_seqname]:
                         # get the current contig

diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py
@@ -1254,3 +1254,28 @@ def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess f
         if output is not None:
             with open(output, 'w') as fout:
                 fout.write(result.stdout)
+
+
+
+def has_non_ascii(string_to_test: str) -> bool:
+    """
+    Check if a string contains any non-ASCII characters.
+
+    :param string_to_test: The string to check for non-ASCII characters.
+    :return: True if the string contains non-ASCII characters, False otherwise.
+    """
+    try:
+        string_to_test.encode('ascii')
+    except UnicodeEncodeError:
+        return True
+    return False
+
+def replace_non_ascii(string_with_ascii: str, replacement_string: str = "_") -> str:
+    """
+    Replace each run of consecutive non-ASCII characters in a string with a single copy of the replacement string.
+
+    :param string_with_ascii: The string potentially containing non-ASCII characters.
+    :param replacement_string: The string to replace non-ASCII characters with (default is '_').
+    :return: A new string where all non-ASCII characters have been replaced.
+    """
+    return re.sub(r'[^\x00-\x7F]+', replacement_string, string_with_ascii)
diff --git a/tests/utils/test_utilities.py b/tests/utils/test_utilities.py
@@ -7,8 +7,7 @@
 import zipfile
 from typing import Generator
 
-from ppanggolin.utils import is_compressed, read_compressed_or_not, write_compressed_or_not
-
+from ppanggolin.utils import is_compressed, read_compressed_or_not, write_compressed_or_not,  has_non_ascii, replace_non_ascii
 
 class TestCompressed:
     """
@@ -157,3 +156,27 @@ def test_write_uncompressed(self, plain_file_path: Path) -> None:
             f.write("Test data")
         with open(plain_file_path, 'r') as f:
             assert f.read() == "Test data"
+
+
+# Test cases for has_non_ascii
+@pytest.mark.parametrize("input_string, expected", [
+    ("Escherichia_coli", False),  # All ASCII characters
+    ("Escherichia_colí", True),   # Contains non-ASCII character 'í'
+    ("simple_string", False),     # Simple ASCII string
+    ("Ωmega", True),              # Contains non-ASCII character 'Ω'
+    ("", False),                  # Empty string should return False
+])
+def test_has_non_ascii(input_string, expected):
+    assert has_non_ascii(input_string) == expected
+
+# Test cases for replace_non_ascii
+@pytest.mark.parametrize("input_string, replacement, expected", [
+    ("Escherichia_coli", "_", "Escherichia_coli"),  # All ASCII characters, no replacement needed
+    ("Escherichia_colí", "_", "Escherichia_col_"),  # Replace 'í' with '_'
+    ("Ωmega", "-", "-mega"),                        # Replace 'Ω' with '-'
+    ("Escherichia_Ωcoli", "X", "Escherichia_Xcoli"),# Replace 'Ω' with 'X'
+    ("", "_", ""),                                  # Empty string, no replacement
+])
+def test_replace_non_ascii(input_string, replacement, expected):
+    assert replace_non_ascii(input_string, replacement) == expected
+