From e12929a909e7871684c51dc8e6b7d5382d4f8d57 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Mon, 2 Dec 2024 15:01:50 +0100 Subject: [PATCH] manage occurrence of full stops in a better way (#229) * manage occurrence of full stops in a better way * bump version * cleanup --- ammico/test/test_text.py | 13 +++++++++++++ ammico/text.py | 42 +++++++++++++++++++++++++++++++++------- pyproject.toml | 2 +- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/ammico/test/test_text.py b/ammico/test/test_text.py index 67ae0c76..5ebb00df 100644 --- a/ammico/test/test_text.py +++ b/ammico/test/test_text.py @@ -141,6 +141,19 @@ def test_init_revision_numbers_and_models(accepted): tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted) +def test_check_add_space_after_full_stop(accepted): + test_obj = tt.TextDetector({}, accept_privacy=accepted) + test_obj.subdict["text"] = "I like cats. I like dogs." + test_obj._check_add_space_after_full_stop() + assert test_obj.subdict["text"] == "I like cats. I like dogs." + test_obj.subdict["text"] = "I like cats." + test_obj._check_add_space_after_full_stop() + assert test_obj.subdict["text"] == "I like cats." + test_obj.subdict["text"] = "www.icanhascheezburger.com" + test_obj._check_add_space_after_full_stop() + assert test_obj.subdict["text"] == "www. icanhascheezburger. com" + + @pytest.mark.gcv def test_analyse_image(set_testdict, set_environ, accepted): for item in set_testdict: diff --git a/ammico/text.py b/ammico/text.py index 61499022..0d020afc 100644 --- a/ammico/text.py +++ b/ammico/text.py @@ -4,6 +4,7 @@ import spacy import io import os +import re from ammico.utils import AnalysisMethod import grpc import pandas as pd @@ -225,6 +226,39 @@ def _initialize_spacy(self): spacy.cli.download("en_core_web_md") self.nlp = spacy.load("en_core_web_md") + def _check_add_space_after_full_stop(self): + """Add a space after a full stop. 
Required by googletrans.""" + # we have found text, now we check for full stops + index_stop = [ + i.start() for i in re.finditer("\.", self.subdict["text"]) # noqa + ] + if not index_stop: # no full stops found + return + # check if this includes the last string item + end_of_list = False + if len(self.subdict["text"]) <= (index_stop[-1] + 1): + # the last found full stop is at the end of the string + # but we can include all others + if len(index_stop) == 1: + end_of_list = True + else: + index_stop.pop() + if end_of_list: # only one full stop at end of string + return + # if this is not the end of the list, check if there is a space after the full stop + no_space = [i for i in index_stop if self.subdict["text"][i + 1] != " "] + if not no_space: # all full stops have a space after them + return + # else, amend the text + add_one = 1 + for i in no_space: + self.subdict["text"] = ( + self.subdict["text"][: i + add_one] + + " " + + self.subdict["text"][i + add_one :] + ) + add_one += 1 + def analyse_image(self) -> dict: """Perform text extraction and analysis of the text. @@ -239,13 +273,7 @@ def analyse_image(self) -> dict: else: # make sure all full stops are followed by whitespace # otherwise googletrans breaks - index_stop = self.subdict["text"].find(".") - if self.subdict["text"][index_stop + 1] != " ": - self.subdict["text"] = ( - self.subdict["text"][: index_stop + 1] - + " " - + self.subdict["text"][index_stop + 1 :] - ) + self._check_add_space_after_full_stop() self.translate_text() self.remove_linebreaks() if self.analyse_text: diff --git a/pyproject.toml b/pyproject.toml index ff1e8516..cbd55bea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "ammico" -version = "0.2.3" +version = "0.2.4" description = "AI Media and Misinformation Content Analysis Tool" readme = "README.md" maintainers = [