diff --git a/tests/test_load_corpus.py b/tests/test_load_corpus.py
index 36713d52..ada3921a 100644
--- a/tests/test_load_corpus.py
+++ b/tests/test_load_corpus.py
@@ -578,9 +578,19 @@ def test_normalise(self):
         expected_default = "hello mr how are you doing s o"
         self.assertEqual(results, expected_default)
         # WHEN
+        results = superstyl.preproc.pipe.normalise(text, no_ascii=True)
+        # THEN (non-ASCII characters are expected to be kept as-is)
+        expected_default = "hello mr 𓀁 how are you doing ſ õ"
+        self.assertEqual(results, expected_default)
+        # WHEN
         results = superstyl.preproc.pipe.normalise(text, keep_punct=True)
         # THEN
         expected_keeppunct = "Hello, Mr. , how are SSSS you; doing? s o"
+        self.assertEqual(results, expected_keeppunct)
+        # WHEN (punctuation kept and non-ASCII characters kept)
+        results = superstyl.preproc.pipe.normalise(text, keep_punct=True, no_ascii=True)
+        # THEN
+        expected_keeppunct = "Hello, Mr. 𓀁, how are §§ you; doing? ſ õ"
         self.assertEqual(results, expected_keeppunct)
         # WHEN
         results = superstyl.preproc.pipe.normalise(text, keep_sym=True)
@@ -600,6 +610,20 @@ def test_normalise(self):
         # gives: 'Coucou 😵 💫'
         # because of the way NFC normalisation is handled probably
 
+        # Test for Armenian (non-Latin script, exercised with no_ascii=True)
+        # GIVEN
+        text = " քան զսակաւս ։ Ահա նշանագրեցի"
+        # WHEN
+        results = superstyl.preproc.pipe.normalise(text, no_ascii=True)
+        # THEN (expected: lowercased, Armenian full stop removed)
+        expected_default = "քան զսակաւս ահա նշանագրեցի"
+        self.assertEqual(results, expected_default)
+        # WHEN
+        results = superstyl.preproc.pipe.normalise(text, keep_punct=True, no_ascii=True)
+        # THEN (expected: case and Armenian full stop preserved)
+        expected_keeppunct = "քան զսակաւս ։ Ահա նշանագրեցի"
+        self.assertEqual(results, expected_keeppunct)
+
     def test_detect_lang(self):
         french = "Bonjour, Monsieur, comment allez-vous?"
         # NB: it fails on that !!!