❇️ Improve the detection around some cases (#366)
Closes #365, #357, #356
Ousret authored Oct 19, 2023
1 parent 49653a6 commit 66966f1
Showing 6 changed files with 24 additions and 11 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -2,6 +2,12 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

+## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-??)
+
+### Changed
+- Optional mypyc compilation upgraded to version 1.6.0 for Python >= 3.8
+- Improved the general detection reliability based on reports from the community
+
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)

### Added
@@ -14,7 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Changed
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
-- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.7
+- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8

### Fixed
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
4 changes: 3 additions & 1 deletion bin/coverage.py
@@ -5,7 +5,7 @@
from typing import List
import argparse

-from charset_normalizer import from_path
+from charset_normalizer import from_path, __version__
from charset_normalizer.utils import iana_name

from os import sep
@@ -40,6 +40,8 @@ def cli_coverage(arguments: List[str]):
print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
exit(1)

print(f"> using charset-normalizer {__version__}")

success_count = 0
total_count = 0

13 changes: 8 additions & 5 deletions charset_normalizer/md.py
@@ -233,16 +233,13 @@ def reset(self) -> None: # pragma: no cover

@property
def ratio(self) -> float:
-if self._character_count == 0:
+if self._character_count <= 24:
return 0.0

ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
) / self._character_count

-if ratio_of_suspicious_range_usage < 0.1:
-    return 0.0
-
return ratio_of_suspicious_range_usage
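In practice, the reworked guard trades the old per-ratio clamp for a minimum sample size: payloads of 24 characters or fewer are no longer judged at all, while any longer payload now reports its raw ratio instead of having small values (below 0.1) silenced. A minimal stand-alone sketch of the new behaviour (not the plugin itself; the helper name below is purely illustrative):

```python
# Simplified stand-in for the plugin's `ratio` property, for illustration only.
def suspicious_range_ratio(character_count: int, suspicious_jumps: int) -> float:
    # New guard: very short payloads (<= 24 characters) are never judged.
    if character_count <= 24:
        return 0.0
    # Unchanged scoring: each suspicious successive-range jump counts double.
    return (suspicious_jumps * 2) / character_count


print(suspicious_range_ratio(20, 5))   # 0.0  -- too short to be meaningful
print(suspicious_range_ratio(100, 3))  # 0.06 -- previously clamped to 0.0 by the `< 0.1` check
```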


@@ -295,7 +292,11 @@ def feed(self, character: str) -> None:
self._is_current_word_bad = True
# Word/Buffer ending with an upper case accentuated letter are so rare,
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
-if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
+if (
+    is_accentuated(self._buffer[-1])
+    and self._buffer[-1].isupper()
+    and all(_.isupper() for _ in self._buffer) is False
+):
self._foreign_long_count += 1
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
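The extra `all(_.isupper() for _ in self._buffer) is False` clause means a word written entirely in capitals is no longer penalised for ending in an accentuated capital; only mixed-case words with such an ending remain suspicious. A rough sketch of the rule, using a simplified `is_accentuated` stand-in rather than the real helper from `charset_normalizer.utils` (the function names here are illustrative only):

```python
import unicodedata


def is_accentuated(character: str) -> bool:
    # Rough stand-in for charset_normalizer.utils.is_accentuated.
    try:
        description = unicodedata.name(character)
    except ValueError:
        return False
    return " WITH " in description


def ending_is_suspicious(buffer: str) -> bool:
    # Sketch of the updated rule: an accentuated upper-case final letter is only
    # suspicious when the word is NOT written entirely in upper case.
    return (
        is_accentuated(buffer[-1])
        and buffer[-1].isupper()
        and not all(char.isupper() for char in buffer)
    )


print(ending_is_suspicious("GranitÉ"))  # True  -- mixed case ending in an accentuated capital
print(ending_is_suspicious("PÂTÉ"))     # False -- all-caps words are no longer flagged
```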
@@ -521,6 +522,8 @@ def is_suspiciously_successive_range(
return False
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
+if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
+    return False

return True
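With the new early return, a jump between Basic Latin (plain ASCII letters, digits, punctuation) and any other block is never treated as suspicious, so ASCII fragments embedded in, say, Cyrillic or CJK text stop inflating the noise score. A heavily trimmed sketch of the decision, keeping only the escape hatches visible in this hunk (the real function applies several more keyword checks before falling through to True):

```python
from typing import Optional


def ranges_look_suspicious(range_a: Optional[str], range_b: Optional[str]) -> bool:
    # Trimmed, illustrative sketch of md.is_suspiciously_successive_range.
    if range_a is None or range_b is None:
        return True
    if range_a == range_b:
        return False
    if "Forms" in range_a or "Forms" in range_b:
        return False
    # New in this commit: plain ASCII next to anything else is never suspicious.
    if range_a == "Basic Latin" or range_b == "Basic Latin":
        return False
    return True


print(ranges_look_suspicious("Basic Latin", "Cyrillic"))       # False as of this change
print(ranges_look_suspicious("Cyrillic", "Hangul Syllables"))  # True in this trimmed sketch
```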

4 changes: 2 additions & 2 deletions charset_normalizer/utils.py
@@ -96,7 +96,7 @@ def is_symbol(character: str) -> bool:
if character_range is None:
return False

return "Forms" in character_range
return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -106,7 +106,7 @@ def is_emoticon(character: str) -> bool:
if character_range is None:
return False

return "Emoticons" in character_range
return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
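Both tweaks narrow or widen a single classification: `is_symbol` now excludes characters from the various "…Forms" blocks whose Unicode category is `Lo` (letters, such as halfwidth katakana), and `is_emoticon` now also accepts the "…Pictographs" blocks. A quick check against the released package might look like this (a sketch assuming charset-normalizer >= 3.3.1 is installed; the expected outputs are noted as comments, not guaranteed):

```python
import unicodedata

from charset_normalizer.utils import is_emoticon, is_symbol, unicode_range

# U+FF8A (halfwidth katakana "ha") sits in "Halfwidth and Fullwidth Forms" but its
# category is "Lo" (letter), so it should no longer be classified as a symbol.
half_width_ka = "\uff8a"
print(unicode_range(half_width_ka), unicodedata.category(half_width_ka), is_symbol(half_width_ka))

# U+1FA73 lives in a "... Pictographs ..." block, so it should now count as an emoticon.
shorts_emoji = "\U0001FA73"
print(unicode_range(shorts_emoji), is_emoticon(shorts_emoji))
```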
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
@@ -2,5 +2,5 @@
Expose version
"""

__version__ = "3.3.0"
__version__ = "3.3.1"
VERSION = __version__.split(".")
4 changes: 3 additions & 1 deletion tests/test_edge_case.py
@@ -1,6 +1,8 @@
from charset_normalizer import from_bytes
import pytest
import platform


@pytest.mark.xfail(platform.python_version_tuple()[0] == "3" and platform.python_version_tuple()[1] == "7", reason="Unicode database is too old for this case (Python 3.7)")
def test_unicode_edge_case():
payload = b'\xef\xbb\xbf\xf0\x9f\xa9\xb3'
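The payload is a UTF-8 BOM followed by a single four-byte sequence. Decoding it shows why older interpreters are expected to fail here: the code point was only added in Unicode 12, which the `unicodedata` tables shipped with Python 3.7 predate. A small illustration:

```python
import unicodedata

payload = b"\xef\xbb\xbf\xf0\x9f\xa9\xb3"

# "utf_8_sig" strips the three-byte BOM, leaving exactly one code point.
character = payload.decode("utf_8_sig")
print(f"U+{ord(character):04X}")  # U+1FA73

try:
    # Resolves on Unicode 12+ databases (Python >= 3.8); expected to raise on Python 3.7.
    print(unicodedata.name(character))
except ValueError:
    print("unknown to this interpreter's Unicode database")
```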

