From 3793b76b997d4d6e9520254a129dd941e555bba4 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:01:16 +0000 Subject: [PATCH] lsar: replace control chars on UnicodeDecodeError --- xklb/createdb/site_add.py | 4 ++-- xklb/mediafiles/media_check.py | 4 ++-- xklb/mediafiles/process_media.py | 8 +++++++- xklb/scratch/mam_search.py | 6 +++--- xklb/text/json_keys_rename.py | 4 ++-- xklb/utils/arg_utils.py | 8 ++++---- xklb/utils/processes.py | 23 +++++++++++++++-------- xklb/utils/strings.py | 18 +++++++++++++++++- 8 files changed, 52 insertions(+), 23 deletions(-) diff --git a/xklb/createdb/site_add.py b/xklb/createdb/site_add.py index 91a266f1..bffcd05a 100644 --- a/xklb/createdb/site_add.py +++ b/xklb/createdb/site_add.py @@ -1,4 +1,4 @@ -import argparse, json +import argparse from collections import defaultdict from io import StringIO @@ -220,7 +220,7 @@ def response_interceptor(request, response): if any(s in body for s in ["searchKeywords"]): return - body = json.loads(body) + body = strings.safe_json_loads(body) tables = nosql_to_sql(body) elif args.extract_html and response.headers["Content-Type"].startswith(("text/html",)): diff --git a/xklb/mediafiles/media_check.py b/xklb/mediafiles/media_check.py index aebd35d0..f053dbd3 100644 --- a/xklb/mediafiles/media_check.py +++ b/xklb/mediafiles/media_check.py @@ -1,4 +1,4 @@ -import fractions, json, os, shlex, subprocess, tempfile +import fractions, os, shlex, subprocess, tempfile from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from shutil import which @@ -144,7 +144,7 @@ def decode_full_scan(path, audio_scan=False, frames="frames", threads=None): ] r_frames = processes.cmd(*ffprobe_cmd) - data = json.loads(r_frames.stdout)["streams"][0] + data = strings.safe_json_loads(r_frames.stdout)["streams"][0] r_frame_rate = fractions.Fraction(data["r_frame_rate"]) nb_frames = int(data.get(f"nb_read_{frames}") or 0) diff 
--git a/xklb/mediafiles/process_media.py b/xklb/mediafiles/process_media.py index 18040726..0175748e 100644 --- a/xklb/mediafiles/process_media.py +++ b/xklb/mediafiles/process_media.py @@ -2,6 +2,8 @@ from contextlib import suppress from shutil import which +import concurrent.futures + from xklb import usage from xklb.mediadb import db_history from xklb.mediafiles import process_ffmpeg, process_image, process_text @@ -252,7 +254,11 @@ def process_media() -> None: args = parse_args() media = collect_media(args) - media = iterables.conform(check_shrink(args, m) for m in media) + mp_args = argparse.Namespace(**{k: v for k, v in args.__dict__.items() if k not in {"db"}}) + with concurrent.futures.ThreadPoolExecutor() as executor: # mostly for lsar but also ffprobe + futures = {executor.submit(check_shrink, mp_args, m) for m in media} + media = iterables.conform(v.result() for v in futures) + media = sorted( media, key=lambda d: d["savings"] / (d["processing_time"] or args.transcoding_image_time), reverse=True ) diff --git a/xklb/scratch/mam_search.py b/xklb/scratch/mam_search.py index 68d1cc42..ea565412 100644 --- a/xklb/scratch/mam_search.py +++ b/xklb/scratch/mam_search.py @@ -1,7 +1,7 @@ -import argparse, json +import argparse from sqlite3 import IntegrityError -from xklb.utils import arggroups, argparse_utils, nums, objects, web +from xklb.utils import arggroups, argparse_utils, nums, objects, strings, web from xklb.utils.log_utils import log @@ -62,7 +62,7 @@ def get_page(args, query_data): df = pd.DataFrame(data) df = df.drop(columns=["cat", "language", "category", "main_cat", "browseflags", "comments", "owner", "leechers"]) - safe_json = objects.fallback(json.loads, {}) + safe_json = objects.fallback(strings.safe_json_loads, {}) def dict_values_str(d): return ", ".join(d.values()) diff --git a/xklb/text/json_keys_rename.py b/xklb/text/json_keys_rename.py index 6862b8d5..d512da6c 100644 --- a/xklb/text/json_keys_rename.py +++ b/xklb/text/json_keys_rename.py @@ 
-1,7 +1,7 @@ import json, sys from xklb import usage -from xklb.utils import arg_utils, arggroups, argparse_utils, printing, processes +from xklb.utils import arg_utils, arggroups, argparse_utils, printing, processes, strings from xklb.utils.log_utils import log @@ -30,7 +30,7 @@ def rename_keys(json_data, key_mapping): def gen_d(line): - json_data = json.loads(line) + json_data = strings.safe_json_loads(line) if isinstance(json_data, list): yield from json_data elif isinstance(json_data, dict): diff --git a/xklb/utils/arg_utils.py b/xklb/utils/arg_utils.py index caa65592..68062c90 100644 --- a/xklb/utils/arg_utils.py +++ b/xklb/utils/arg_utils.py @@ -1,9 +1,9 @@ -import argparse, json, operator, random +import argparse, operator, random from collections import defaultdict from copy import copy from pathlib import Path -from xklb.utils import consts, file_utils, iterables, nums, processes +from xklb.utils import consts, file_utils, iterables, nums, processes, strings from xklb.utils.consts import SC @@ -13,7 +13,7 @@ def gen_paths(args, default_exts=None): if args.from_json: for path in args.paths: - json_data = json.loads(path) + json_data = strings.safe_json_loads(path) if isinstance(json_data, list): yield from (d["path"] for d in json_data) elif isinstance(json_data, dict): @@ -36,7 +36,7 @@ def gen_d(args, default_exts=None): if args.from_json: for path in args.paths: - json_data = json.loads(path) + json_data = strings.safe_json_loads(path) if isinstance(json_data, list): yield from json_data elif isinstance(json_data, dict): diff --git a/xklb/utils/processes.py b/xklb/utils/processes.py index a0fe50db..42bdb411 100644 --- a/xklb/utils/processes.py +++ b/xklb/utils/processes.py @@ -4,7 +4,7 @@ from shutil import which from typing import NoReturn -from xklb.utils import consts, iterables, nums, path_utils +from xklb.utils import consts, iterables, nums, path_utils, strings from xklb.utils.log_utils import log @@ -309,7 +309,7 @@ def __init__(self, path, 
def safe_json_loads(s):
    """Parse JSON leniently, tolerating undecodable bytes and raw control characters.

    Args:
        s: JSON document as ``str``, ``bytes`` or ``bytearray``.

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: if the document is still invalid after control
            characters have been stripped.
    """
    if isinstance(s, (bytes, bytearray)):
        # "utf-8-sig" drops a leading UTF-8 BOM, which json.loads() rejects in
        # str input; invalid byte sequences become U+FFFD instead of raising
        # UnicodeDecodeError.
        s = bytes(s).decode("utf-8-sig", errors="replace")

    try:
        return json.loads(s)
    except json.JSONDecodeError:
        # Retry with C0/C1 control characters removed: tools such as lsar can
        # emit them unescaped inside strings, which strict JSON forbids.
        return json.loads(re.sub(r"[\x00-\x1f\x7f-\x9f]", "", s))


def safe_json_load(path):
    """Read the file at ``path`` as bytes and parse it with ``safe_json_loads``.

    Raises:
        OSError: if the file cannot be opened or read.
        json.JSONDecodeError: if the content is not valid JSON even after cleanup.
    """
    with open(path, "rb") as file:
        binary_data = file.read()
    # Decoding (BOM handling, replacement of invalid bytes) lives in one place.
    return safe_json_loads(binary_data)