Skip to content

Commit

Permalink
Release 1.2.0 (#16)
Browse files Browse the repository at this point in the history
* Add method to determine if specific encoding is multi byte

* Add has_submatch property on a match

* Disallow calling __eq__ on different object type

* Add percent_chaos and percent_coherence

Better readability on chaos and coherence instead of ratio between 0. and 1.

* Coherence ratio based on mean instead of sum of best results

* Using loguru for trace/debug <3

* best() method rewritten

* from_byte method improved

  - new parameters
  - debug available with loguru
  - probe chaos improved for hiragana and katakana

* Experimental, hook on UnicodeDecodeError

Provide encoding detection on decoding error.

* bump 1.2.0

add loguru dep+

* Add test CLI normalize file without replacing it
  • Loading branch information
Ousret authored Sep 28, 2019
1 parent 5abfb83 commit 6ea66b2
Show file tree
Hide file tree
Showing 10 changed files with 227 additions and 62 deletions.
1 change: 1 addition & 0 deletions charset_normalizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from charset_normalizer.probe_coherence import ProbeCoherence
from charset_normalizer.probe_words import ProbeWords
from charset_normalizer.legacy import detect
from charset_normalizer.hook import charset_normalizer_hook
3 changes: 3 additions & 0 deletions charset_normalizer/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
Scrapped from https://unicode-table.com/
"""
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from _multibytecodec import MultibyteIncrementalDecoder

MULTI_BYTE_DECODER = MultibyteIncrementalDecoder

UNICODE_RANGES = [
"0000−001F",
Expand Down
15 changes: 15 additions & 0 deletions charset_normalizer/encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from charset_normalizer.constant import MULTI_BYTE_DECODER
import importlib


def is_multi_byte_encoding(encoding_name):
    """
    Verify if a specific encoding is a multi byte one, based on its IANA name.

    :param str encoding_name: IANA encoding name (or Python codec name)
    :return: True if the encoding uses a multi byte scheme
    :rtype: bool
    :raises ModuleNotFoundError: If the name does not resolve to a known codec module
    """
    # IANA names use hyphens (e.g. "utf-8") while Python's encodings
    # package names its modules with underscores (e.g. "utf_8");
    # normalize so genuine IANA names resolve instead of raising.
    python_name = encoding_name.strip().lower().replace('-', '_')
    return issubclass(
        importlib.import_module('encodings.{encoding_name}'.format(encoding_name=python_name)).IncrementalDecoder,
        MULTI_BYTE_DECODER
    )
14 changes: 14 additions & 0 deletions charset_normalizer/hook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import sys
from charset_normalizer.legacy import detect


def charset_normalizer_hook(exctype, value, traceback):
    """
    Global exception hook that enriches an uncaught UnicodeDecodeError with an
    encoding suggestion produced by charset detection, then delegates to the
    interpreter's default handler.

    :param type exctype: Class of the uncaught exception
    :param BaseException value: The exception instance
    :param traceback: Traceback object associated with the exception
    """
    # Use issubclass, not equality, so subclasses of UnicodeDecodeError
    # raised by third-party code are also handled.
    if issubclass(exctype, UnicodeDecodeError):
        cp_detection = detect(value.object)
        if cp_detection['encoding'] is not None:
            value.reason = value.reason + '; you may want to consider {} codec for this sequence.'.format(cp_detection['encoding'])

    # Always fall back to the default handling so the traceback is printed.
    sys.__excepthook__(exctype, value, traceback)


# Installing the hook is a deliberate import-time side effect of this module.
sys.excepthook = charset_normalizer_hook
197 changes: 142 additions & 55 deletions charset_normalizer/normalizer.py

Large diffs are not rendered by default.

25 changes: 22 additions & 3 deletions charset_normalizer/probe_chaos.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
@lru_cache(maxsize=8192)
class ProbeChaos:

def __init__(self, string, giveup_threshold=0.09):
def __init__(self, string, giveup_threshold=0.09, bonus_bom_sig=False, bonus_multi_byte=False):
"""
:param str string:
:param float giveup_threshold: When to give up even if _probe has not finished yet
:param bool bonus_bom_sig: Decide if ratio should take in consideration a bonus because of BOM/SIG
:param bool bonus_multi_byte: Decide if ratio should take in consideration a bonus because of multi byte scheme decoder
"""

if not isinstance(string, str):
Expand All @@ -26,6 +28,9 @@ def __init__(self, string, giveup_threshold=0.09):
self._string = string
self._threshold = giveup_threshold

self._bonus_bom_sig = bonus_bom_sig
self._bonus_multi_byte = bonus_multi_byte

self.successive_upper_lower = 0
self.successive_accent = 0
self.successive_different_unicode_range = 0
Expand All @@ -46,14 +51,18 @@ def __init__(self, string, giveup_threshold=0.09):
self.total_letter_encountered = 0

self.total_lower_letter_encountered = 0
self.total_upper_letter_encountered = 0

self.total_upper_accent_encountered = 0
self.total_upper_accent_encountered_inner = 0

self.total_unaccented_letter_encountered = 0

self._probe_word = ProbeWords(HashableCounter(self._string.split()))

self.gave_up = False

# Artificially increase string size to get more significant result.
if 32 > len(self._string) > 0:
self._string *= int(32 / len(self._string)) + 1

Expand Down Expand Up @@ -165,6 +174,9 @@ def _probe(self):
if is_lower:
self.total_lower_letter_encountered += 1

if is_upper:
self.total_upper_letter_encountered += 1

if is_upper and is_accent:
self.total_upper_accent_encountered += 1
if self.previous_printable_letter.isalpha():
Expand Down Expand Up @@ -237,7 +249,14 @@ def ratio(self):
:return: Ratio as floating number
:rtype: float
"""
r_ = self.total_upper_accent_encountered if self.total_letter_encountered > 0 and self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5 else 0

r_ = self.total_upper_accent_encountered if self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5 else 0
q_ = self.total_upper_letter_encountered / 3 if self.total_upper_letter_encountered > self.total_lower_letter_encountered * 0.4 else 0
z_ = UnicodeRangeIdentify.unravel_suspicious_ranges(len(self._string), self.encountered_unicode_range_occurrences)
p_ = self.encountered_punc_sign if self.encountered_punc_sign / len(self._string) > 0.2 else 0
return ((r_ + p_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string)) + self._probe_word.ratio # + len(self.encountered_unicode_range)-1

bonus_sig_bom = -int(len(self._string)*0.5) if self._bonus_bom_sig is True else 0

initial_ratio = ((r_ + p_ + q_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + bonus_sig_bom + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string)) + self._probe_word.ratio # + len(self.encountered_unicode_range)-1

return initial_ratio / 1.3 if self._bonus_multi_byte is True and initial_ratio > 0. else initial_ratio
3 changes: 2 additions & 1 deletion charset_normalizer/probe_coherence.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# coding: utf-8
import json
import statistics
from collections import Counter
from functools import lru_cache
from os.path import dirname, realpath, exists
Expand Down Expand Up @@ -85,7 +86,7 @@ def ratio(self):

ratios = [self.rank_per_lang[lg] for lg in languages]

return sum(ratios) / 2 if self.non_latin_covered_any is True else sum(ratios)
return statistics.mean(ratios) / 2 if self.non_latin_covered_any is True else statistics.mean(ratios)

@property
def coverage(self):
Expand Down
7 changes: 6 additions & 1 deletion charset_normalizer/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ def unravel_suspicious_ranges(str_len, encountered_unicode_range_occurrences):
items = encountered_unicode_range_occurrences.items()
s_ = 0

# print(encountered_unicode_range_occurrences)

for k, v in items:
k_ = k.lower()
if (
Expand All @@ -101,7 +103,10 @@ def unravel_suspicious_ranges(str_len, encountered_unicode_range_occurrences):
continue
if 'halfwidth and fullwidth forms' in k_ and any(['CJK' in el for el in encountered_unicode_range_occurrences.keys()]):
continue
s_ += v if 'geometric shapes' not in k_ else v * 10
if 'hiragana' in k_ or 'katakana' in k_:
continue
# print('suspicious', k_, 'with', v)
s_ += v

return s_

Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@
EMAIL = '[email protected]'
AUTHOR = 'Ahmed TAHRI @Ousret'
REQUIRES_PYTHON = '>=3.5.0'
VERSION = '1.1.1'
VERSION = '1.2.0'

REQUIRED = [
'cached_property',
'dragonmapper',
'zhon',
'prettytable'
'prettytable',
'loguru'
]

EXTRAS = {
Expand Down
19 changes: 19 additions & 0 deletions test/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import unittest
from charset_normalizer.cli.normalizer import cli_detect, query_yes_no
from unittest.mock import patch
from os.path import exists
from os import remove


class TestCommandLineInterface(unittest.TestCase):
Expand All @@ -26,6 +28,23 @@ def test_single_file(self):
)
)

def test_single_file_normalize(self):
self.assertEqual(
0,
cli_detect(
['./data/sample.1.ar.srt', '--normalize']
)
)

self.assertTrue(
exists('./data/sample.1.ar.cp1256.srt')
)

try:
remove('./data/sample.1.ar.cp1256.srt')
except:
pass

def test_single_verbose_file(self):
self.assertEqual(
0,
Expand Down

0 comments on commit 6ea66b2

Please sign in to comment.