Skip to content

Commit

Permalink
Release 1.2.0 (#16)
Browse files Browse the repository at this point in the history
* Add method to determine if specific encoding is multi byte

* Add has_submatch property on a match

* Disallow calling __eq__ on different object type

* Add percent_chaos and percent_coherence

Better readability on chaos and coherence instead of ratio between 0. and 1.

* Coherence ratio based on mean instead of sum of best results

* Using loguru for trace/debug <3

* best() method rewritten

* from_byte method improved

  - new parameters
  - debug available with loguru
  - probe chaos improved for hiragana and katakana

* Experimental, hook on UnicodeDecodeError

Provide encoding detection on decoding error.

* bump 1.2.0

add loguru dep+

* Add test CLI normalize file without replacing it
  • Loading branch information
Ousret authored Sep 28, 2019
1 parent 5abfb83 commit 6ea66b2
Show file tree
Hide file tree
Showing 10 changed files with 227 additions and 62 deletions.
1 change: 1 addition & 0 deletions charset_normalizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from charset_normalizer.probe_coherence import ProbeCoherence
from charset_normalizer.probe_words import ProbeWords
from charset_normalizer.legacy import detect
from charset_normalizer.hook import charset_normalizer_hook
3 changes: 3 additions & 0 deletions charset_normalizer/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
Scrapped from https://unicode-table.com/
"""
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from _multibytecodec import MultibyteIncrementalDecoder

MULTI_BYTE_DECODER = MultibyteIncrementalDecoder

UNICODE_RANGES = [
"0000−001F",
Expand Down
15 changes: 15 additions & 0 deletions charset_normalizer/encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from charset_normalizer.constant import MULTI_BYTE_DECODER
import importlib


def is_multi_byte_encoding(encoding_name):
    """
    Verify if a specific encoding is a multi byte one, based on its IANA name.

    :param str encoding_name: IANA encoding name (or Python codec name)
    :return: True if the encoding uses a multi byte scheme
    :rtype: bool
    :raises ModuleNotFoundError: If the name does not resolve to a known codec module
    """
    # IANA names use hyphens (e.g. "utf-8") while Python's encodings
    # package names its modules with underscores (e.g. "utf_8");
    # normalize so genuine IANA names resolve instead of raising.
    python_name = encoding_name.strip().lower().replace('-', '_')
    return issubclass(
        importlib.import_module('encodings.{encoding_name}'.format(encoding_name=python_name)).IncrementalDecoder,
        MULTI_BYTE_DECODER
    )
14 changes: 14 additions & 0 deletions charset_normalizer/hook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import sys
from charset_normalizer.legacy import detect


def charset_normalizer_hook(exctype, value, traceback):
    """
    Global exception hook that enriches an uncaught UnicodeDecodeError with an
    encoding suggestion produced by charset detection, then delegates to the
    interpreter's default handler.

    :param type exctype: Class of the uncaught exception
    :param BaseException value: The exception instance
    :param traceback: Traceback object associated with the exception
    """
    # Use issubclass, not equality, so subclasses of UnicodeDecodeError
    # raised by third-party code are also handled.
    if issubclass(exctype, UnicodeDecodeError):
        cp_detection = detect(value.object)
        if cp_detection['encoding'] is not None:
            value.reason = value.reason + '; you may want to consider {} codec for this sequence.'.format(cp_detection['encoding'])

    # Always fall back to the default handling so the traceback is printed.
    sys.__excepthook__(exctype, value, traceback)


# Installing the hook is a deliberate import-time side effect of this module.
sys.excepthook = charset_normalizer_hook
197 changes: 142 additions & 55 deletions charset_normalizer/normalizer.py

Large diffs are not rendered by default.

25 changes: 22 additions & 3 deletions charset_normalizer/probe_chaos.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
@lru_cache(maxsize=8192)
class ProbeChaos:

def __init__(self, string, giveup_threshold=0.09):
def __init__(self, string, giveup_threshold=0.09, bonus_bom_sig=False, bonus_multi_byte=False):
"""
:param str string:
:param float giveup_threshold: When to give up even if _probe has not finished yet
:param bool bonus_bom_sig: Decide if ratio should take in consideration a bonus because of BOM/SIG
:param bool bonus_multi_byte: Decide if ratio should take in consideration a bonus because of multi byte scheme decoder
"""

if not isinstance(string, str):
Expand All @@ -26,6 +28,9 @@ def __init__(self, string, giveup_threshold=0.09):
self._string = string
self._threshold = giveup_threshold

self._bonus_bom_sig = bonus_bom_sig
self._bonus_multi_byte = bonus_multi_byte

self.successive_upper_lower = 0
self.successive_accent = 0
self.successive_different_unicode_range = 0
Expand All @@ -46,14 +51,18 @@ def __init__(self, string, giveup_threshold=0.09):
self.total_letter_encountered = 0

self.total_lower_letter_encountered = 0
self.total_upper_letter_encountered = 0

self.total_upper_accent_encountered = 0
self.total_upper_accent_encountered_inner = 0

self.total_unaccented_letter_encountered = 0

self._probe_word = ProbeWords(HashableCounter(self._string.split()))

self.gave_up = False

# Artificially increase string size to get more significant result.
if 32 > len(self._string) > 0:
self._string *= int(32 / len(self._string)) + 1

Expand Down Expand Up @@ -165,6 +174,9 @@ def _probe(self):
if is_lower:
self.total_lower_letter_encountered += 1

if is_upper:
self.total_upper_letter_encountered += 1

if is_upper and is_accent:
self.total_upper_accent_encountered += 1
if self.previous_printable_letter.isalpha():
Expand Down Expand Up @@ -237,7 +249,14 @@ def ratio(self):
:return: Ratio as floating number
:rtype: float
"""
r_ = self.total_upper_accent_encountered if self.total_letter_encountered > 0 and self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5 else 0

r_ = self.total_upper_accent_encountered if self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5 else 0
q_ = self.total_upper_letter_encountered / 3 if self.total_upper_letter_encountered > self.total_lower_letter_encountered * 0.4 else 0
z_ = UnicodeRangeIdentify.unravel_suspicious_ranges(len(self._string), self.encountered_unicode_range_occurrences)
p_ = self.encountered_punc_sign if self.encountered_punc_sign / len(self._string) > 0.2 else 0
return ((r_ + p_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string)) + self._probe_word.ratio # + len(self.encountered_unicode_range)-1

bonus_sig_bom = -int(len(self._string)*0.5) if self._bonus_bom_sig is True else 0

initial_ratio = ((r_ + p_ + q_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + bonus_sig_bom + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string)) + self._probe_word.ratio # + len(self.encountered_unicode_range)-1

return initial_ratio / 1.3 if self._bonus_multi_byte is True and initial_ratio > 0. else initial_ratio
3 changes: 2 additions & 1 deletion charset_normalizer/probe_coherence.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# coding: utf-8
import json
import statistics
from collections import Counter
from functools import lru_cache
from os.path import dirname, realpath, exists
Expand Down Expand Up @@ -85,7 +86,7 @@ def ratio(self):

ratios = [self.rank_per_lang[lg] for lg in languages]

return sum(ratios) / 2 if self.non_latin_covered_any is True else sum(ratios)
return statistics.mean(ratios) / 2 if self.non_latin_covered_any is True else statistics.mean(ratios)

@property
def coverage(self):
Expand Down
7 changes: 6 additions & 1 deletion charset_normalizer/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ def unravel_suspicious_ranges(str_len, encountered_unicode_range_occurrences):
items = encountered_unicode_range_occurrences.items()
s_ = 0

# print(encountered_unicode_range_occurrences)

for k, v in items:
k_ = k.lower()
if (
Expand All @@ -101,7 +103,10 @@ def unravel_suspicious_ranges(str_len, encountered_unicode_range_occurrences):
continue
if 'halfwidth and fullwidth forms' in k_ and any(['CJK' in el for el in encountered_unicode_range_occurrences.keys()]):
continue
s_ += v if 'geometric shapes' not in k_ else v * 10
if 'hiragana' in k_ or 'katakana' in k_:
continue
# print('suspicious', k_, 'with', v)
s_ += v

return s_

Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@
EMAIL = '[email protected]'
AUTHOR = 'Ahmed TAHRI @Ousret'
REQUIRES_PYTHON = '>=3.5.0'
VERSION = '1.1.1'
VERSION = '1.2.0'

REQUIRED = [
'cached_property',
'dragonmapper',
'zhon',
'prettytable'
'prettytable',
'loguru'
]

EXTRAS = {
Expand Down
19 changes: 19 additions & 0 deletions test/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import unittest
from charset_normalizer.cli.normalizer import cli_detect, query_yes_no
from unittest.mock import patch
from os.path import exists
from os import remove


class TestCommandLineInterface(unittest.TestCase):
Expand All @@ -26,6 +28,23 @@ def test_single_file(self):
)
)

def test_single_file_normalize(self):
self.assertEqual(
0,
cli_detect(
['./data/sample.1.ar.srt', '--normalize']
)
)

self.assertTrue(
exists('./data/sample.1.ar.cp1256.srt')
)

try:
remove('./data/sample.1.ar.cp1256.srt')
except:
pass

def test_single_verbose_file(self):
self.assertEqual(
0,
Expand Down

0 comments on commit 6ea66b2

Please sign in to comment.