-
Notifications
You must be signed in to change notification settings - Fork 19
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
utf8 option fails to convert some subtitles #37
Comments
To make this a warning:
--- a/subdl.py
+++ b/subdl.py
@@ -312,8 +312,11 @@
if not result["encoding"] in {"ascii", "utf-8"}:
print(f"Found encoding {result['encoding']} with a confidence of {result['confidence']*100:.2f}%. Converting to utf8.")
# separate lines for easier debugging
- s = s.decode(result["encoding"]) # bytes -> str
- s = s.encode("utf8") # str -> bytes
+ try:
+ s = s.decode(result["encoding"]).encode("utf8") # bytes -> str -> bytes
+ except UnicodeDecodeError as err:
+ print(f"failed to convert {destfilename}: {err}")
+ # keep original encoding of file
writefile(destfilename, s)
print("done, wrote %d bytes."% (len(s)), file=sys.stderr) |
To use libmagic instead of chardet:
--- a/subdl.py
+++ b/subdl.py
@@ -307,13 +307,16 @@
if options.filter:
s = filtersub(s)
if options.utf8:
- import chardet
- result = chardet.detect(s)
- if not result["encoding"] in {"ascii", "utf-8"}:
- print(f"Found encoding {result['encoding']} with a confidence of {result['confidence']*100:.2f}%. Converting to utf8.")
+ import magic
+ result = magic.detect_from_content(s)
+ if not result.encoding in {"us-ascii", "utf-8"}:
+ print(f"Found encoding {result['encoding']}. Converting to utf8.")
# separate lines for easier debugging
- s = s.decode(result["encoding"]) # bytes -> str
- s = s.encode("utf8") # str -> bytes
+ try:
+ s = s.decode(result.encoding).encode("utf8") # bytes -> str -> bytes
+ except UnicodeDecodeError as err:
+ print(f"failed to convert to utf8: {destfilename}: {err}")
+ # keep original encoding of file
writefile(destfilename, s)
print("done, wrote %d bytes."% (len(s)), file=sys.stderr)
@@ -436,9 +439,9 @@
elif option == '--utf8':
options.utf8 = True
try:
- import chardet
+ import magic
except ModuleNotFoundError:
- sys.stderr.write("Error: The --utf8 option requires the chardet module from https://pypi.org/project/chardet/ - Hint: pip install chardet\n")
+ sys.stderr.write("Error: The --utf8 option requires the python-magic module from https://pypi.org/project/python-magic/ - Hint: pip install python-magic\n")
sys.exit(1)
elif option == '--list-languages':
ListLanguages() --- a/subdl.py
+++ b/subdl.py
@@ -310,12 +310,12 @@
import magic
result = magic.detect_from_content(s)
if not result.encoding in {"us-ascii", "utf-8"}:
- print(f"Found encoding {result['encoding']}. Converting to utf8.")
+ print(f"Found encoding {result.encoding}. Converting to utf8.")
# separate lines for easier debugging
try:
s = s.decode(result.encoding).encode("utf8") # bytes -> str -> bytes
except UnicodeDecodeError as err:
- print(f"failed to convert to utf8: {destfilename}: {err}")
+ print(f"failed to convert to utf8 from {result.encoding}: {destfilename}: {err}")
# keep original encoding of file
writefile(destfilename, s)
print("done, wrote %d bytes."% (len(s)), file=sys.stderr) --- a/subdl.py
+++ b/subdl.py
@@ -309,7 +309,7 @@
if options.utf8:
import magic
result = magic.detect_from_content(s)
- if not result.encoding in {"us-ascii", "utf-8"}:
+ if not result.encoding in {"us-ascii", "utf-8", "unknown-8bit", "binary"}:
print(f"Found encoding {result.encoding}. Converting to utf8.")
# separate lines for easier debugging
try: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Irreversible.2002.DVDRip.XviD.AC3-DK.EN.srt
https://www.opensubtitles.org/en/subtitles/3431287/irreversible-en
"johab" sounds weird. lets try latin1
Success! So it's a bug in chardet ...
TODO (workaround): when conversion to UTF-8 fails, keep the original file, show a warning, and rename the result file to `$basename.noutf8.$extension`, for example `Irreversible.2002.en.1952041941.noutf8.srt`.
The text was updated successfully, but these errors were encountered: