Skip to content

Commit

Permalink
Adapt chunk_size and steps according to sequence len (#15)
Browse files Browse the repository at this point in the history
* Adapt chunk_size and steps according to sequence len

* Add pragma no coverage for frequencies json generator

* Add test for CLI query_yes_no and verbose output

* Reset path level for unittest

* bump 1.1.1
  • Loading branch information
Ousret authored Sep 23, 2019
1 parent 38c77fa commit 5abfb83
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 12 deletions.
30 changes: 22 additions & 8 deletions charset_normalizer/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from encodings.aliases import aliases
from os.path import basename, splitext
from platform import python_version_tuple
from warnings import warn

from cached_property import cached_property

Expand Down Expand Up @@ -254,6 +255,7 @@ def __len__(self):
@staticmethod
def normalize(path, steps=10, chunk_size=512, threshold=0.20):
"""
:param str path:
:param int steps:
:param int chunk_size:
Expand Down Expand Up @@ -296,19 +298,31 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
:return: List of potential matches
:rtype: CharsetNormalizerMatches
"""
py_v = [int(el) for el in python_version_tuple()]
py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)

supported = sorted(aliases.items()) if py_need_sort else aliases.items()
too_small_sequence = len(sequences) < 24

tested = set()
matches = list()
if too_small_sequence is True:
warn('Trying to detect encoding from a tiny portion of ({}) bytes.'.format(len(sequences)))

maximum_length = len(sequences)

# Adjust steps and chunk_size when content is just too small for it
if maximum_length <= (chunk_size * steps):
steps = 1

if maximum_length <= chunk_size:
chunk_size = maximum_length
steps = 1
elif steps > 1 and maximum_length / steps < chunk_size:
chunk_size = int(maximum_length / steps)

# Below Python 3.6, expect dict to not behave the same.
py_v = [int(el) for el in python_version_tuple()]
py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)

supported = collections.OrderedDict(aliases).items() if py_need_sort else aliases.items()

tested = set()
matches = list()

for support in supported:

Expand Down Expand Up @@ -360,7 +374,7 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
# chaos_max = max(ratios)

if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median > threshold:
# print(p, 'is too much chaos for decoded input !')
# print(p, 'is too much chaos for decoded input !', nb_gave_up, chaos_median)
continue

encountered_unicode_range_occurrences = dict()
Expand Down Expand Up @@ -396,7 +410,7 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
)
)

# print(p, nb_gave_up, chaos_means, chaos_median, chaos_min, chaos_max, matches[-1].coherence, matches[-1].languages,)
# print(p, nb_gave_up, chaos_means, chaos_median, matches[-1].coherence, matches[-1].languages,)

if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
return CharsetNormalizerMatches([matches[-1]])
Expand Down
3 changes: 1 addition & 2 deletions charset_normalizer/probe_coherence.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def _verify_order_on(target_alphabet_ordered, character_occurrences, distance_ma
return n_not_rightfully_ranked / n_tested, n_tested, n_tested_verified

@staticmethod
def frequencies_json(minimum_char_count=45000000, save_to_file=True, proxies=None):
def frequencies_json(minimum_char_count=45000000, save_to_file=True, proxies=None): # pragma: no cover
"""
This method refreshes or creates frequencies.json at will.
Don't abuse it as it performs HTTP GET query
Expand Down Expand Up @@ -291,4 +291,3 @@ def frequencies_json(minimum_char_count=45000000, save_to_file=True, proxies=Non
with open('{}/frequencies.json'.format(ProbeCoherence.ASSETS_PATH) if exists('{}/frequencies.json'.format(
ProbeCoherence.ASSETS_PATH)) else './charset_normalizer/assets/frequencies.json', 'w', encoding='utf-8') as fp:
json.dump(ProbeCoherence.FREQUENCIES, fp)

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
EMAIL = '[email protected]'
AUTHOR = 'Ahmed TAHRI @Ousret'
REQUIRES_PYTHON = '>=3.5.0'
VERSION = '1.1.0'
VERSION = '1.1.1'

REQUIRED = [
'cached_property',
Expand Down
23 changes: 22 additions & 1 deletion test/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
import unittest
from charset_normalizer.cli.normalizer import cli_detect
from charset_normalizer.cli.normalizer import cli_detect, query_yes_no
from unittest.mock import patch


class TestCommandLineInterface(unittest.TestCase):

@patch('builtins.input', lambda *args: 'y')
def test_simple_yes_input(self):
    """With input() patched to return 'y', query_yes_no must give a truthy result."""
    answer = query_yes_no('Are u willing to chill a little bit ?')
    self.assertTrue(answer)

@patch('builtins.input', lambda *args: 'N')
def test_simple_no_input(self):
    """With input() patched to return 'N', query_yes_no must give a falsy result."""
    answer = query_yes_no('Are u willing to chill a little bit ?')
    self.assertFalse(answer)

def test_single_file(self):

self.assertEqual(
Expand All @@ -13,6 +26,14 @@ def test_single_file(self):
)
)

def test_single_verbose_file(self):
    """Call cli_detect on one sample file with '--verbose' and expect it to return 0."""
    arguments = ['./data/sample.1.ar.srt', '--verbose']
    outcome = cli_detect(arguments)
    self.assertEqual(0, outcome)

def test_multiple_file(self):
self.assertEqual(
0,
Expand Down

0 comments on commit 5abfb83

Please sign in to comment.