Skip to content

Commit

Permalink
lsar: replace control chars on UnicodeDecodeError
Browse files — browse the repository at this point in the history
  • Loading branch information
chapmanjacobd committed Oct 23, 2024
1 parent 7731c59 commit 3793b76
Show file tree
Hide file tree
Showing 8 changed files with 52 additions and 23 deletions.
4 changes: 2 additions & 2 deletions xklb/createdb/site_add.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import argparse, json
import argparse
from collections import defaultdict
from io import StringIO

Expand Down Expand Up @@ -220,7 +220,7 @@ def response_interceptor(request, response):
if any(s in body for s in ["searchKeywords"]):
return

body = json.loads(body)
body = strings.safe_json_loads(body)
tables = nosql_to_sql(body)

elif args.extract_html and response.headers["Content-Type"].startswith(("text/html",)):
Expand Down
4 changes: 2 additions & 2 deletions xklb/mediafiles/media_check.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import fractions, json, os, shlex, subprocess, tempfile
import fractions, os, shlex, subprocess, tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from shutil import which
Expand Down Expand Up @@ -144,7 +144,7 @@ def decode_full_scan(path, audio_scan=False, frames="frames", threads=None):
]

r_frames = processes.cmd(*ffprobe_cmd)
data = json.loads(r_frames.stdout)["streams"][0]
data = strings.safe_json_loads(r_frames.stdout)["streams"][0]

r_frame_rate = fractions.Fraction(data["r_frame_rate"])
nb_frames = int(data.get(f"nb_read_{frames}") or 0)
Expand Down
8 changes: 7 additions & 1 deletion xklb/mediafiles/process_media.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from contextlib import suppress
from shutil import which

import concurrent.futures

from xklb import usage
from xklb.mediadb import db_history
from xklb.mediafiles import process_ffmpeg, process_image, process_text
Expand Down Expand Up @@ -252,7 +254,11 @@ def process_media() -> None:
args = parse_args()
media = collect_media(args)

media = iterables.conform(check_shrink(args, m) for m in media)
mp_args = argparse.Namespace(**{k: v for k, v in args.__dict__.items() if k not in {"db"}})
with concurrent.futures.ThreadPoolExecutor() as executor: # mostly for lsar but also ffprobe
futures = {executor.submit(check_shrink, mp_args, m) for m in media}
media = iterables.conform(v.result() for v in futures)

media = sorted(
media, key=lambda d: d["savings"] / (d["processing_time"] or args.transcoding_image_time), reverse=True
)
Expand Down
6 changes: 3 additions & 3 deletions xklb/scratch/mam_search.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import argparse, json
import argparse
from sqlite3 import IntegrityError

from xklb.utils import arggroups, argparse_utils, nums, objects, web
from xklb.utils import arggroups, argparse_utils, nums, objects, strings, web
from xklb.utils.log_utils import log


Expand Down Expand Up @@ -62,7 +62,7 @@ def get_page(args, query_data):
df = pd.DataFrame(data)
df = df.drop(columns=["cat", "language", "category", "main_cat", "browseflags", "comments", "owner", "leechers"])

safe_json = objects.fallback(json.loads, {})
safe_json = objects.fallback(strings.safe_json_loads, {})

def dict_values_str(d):
return ", ".join(d.values())
Expand Down
4 changes: 2 additions & 2 deletions xklb/text/json_keys_rename.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json, sys

from xklb import usage
from xklb.utils import arg_utils, arggroups, argparse_utils, printing, processes
from xklb.utils import arg_utils, arggroups, argparse_utils, printing, processes, strings
from xklb.utils.log_utils import log


Expand Down Expand Up @@ -30,7 +30,7 @@ def rename_keys(json_data, key_mapping):


def gen_d(line):
json_data = json.loads(line)
json_data = strings.safe_json_loads(line)
if isinstance(json_data, list):
yield from json_data
elif isinstance(json_data, dict):
Expand Down
8 changes: 4 additions & 4 deletions xklb/utils/arg_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import argparse, json, operator, random
import argparse, operator, random
from collections import defaultdict
from copy import copy
from pathlib import Path

from xklb.utils import consts, file_utils, iterables, nums, processes
from xklb.utils import consts, file_utils, iterables, nums, processes, strings
from xklb.utils.consts import SC


Expand All @@ -13,7 +13,7 @@ def gen_paths(args, default_exts=None):

if args.from_json:
for path in args.paths:
json_data = json.loads(path)
json_data = strings.safe_json_loads(path)
if isinstance(json_data, list):
yield from (d["path"] for d in json_data)
elif isinstance(json_data, dict):
Expand All @@ -36,7 +36,7 @@ def gen_d(args, default_exts=None):

if args.from_json:
for path in args.paths:
json_data = json.loads(path)
json_data = strings.safe_json_loads(path)
if isinstance(json_data, list):
yield from json_data
elif isinstance(json_data, dict):
Expand Down
23 changes: 15 additions & 8 deletions xklb/utils/processes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from shutil import which
from typing import NoReturn

from xklb.utils import consts, iterables, nums, path_utils
from xklb.utils import consts, iterables, nums, path_utils, strings
from xklb.utils.log_utils import log


Expand Down Expand Up @@ -309,7 +309,7 @@ def __init__(self, path, *args):
raise OSError
else:
raise UnplayableFile(out, err)
d = json.loads(out.decode("utf-8"))
d = strings.safe_json_loads(out.decode("utf-8"))

self.path = path

Expand Down Expand Up @@ -373,10 +373,12 @@ def lsar(archive_path):
log.error("[%s]: The 'lsar' command is not available. Install 'unar' to check archives", archive_path)
return []

lsar_output = subprocess.run(["lsar", "-json", archive_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

try:
lsar_json = json.loads(lsar_output.stdout)
lsar_output = cmd("lsar", "-json", archive_path, error_verbosity=2)
except subprocess.CalledProcessError:
return []
try:
lsar_json = strings.safe_json_loads(lsar_output.stdout)
except json.JSONDecodeError:
log.warning("[%s]: Error parsing lsar output as JSON: %s", archive_path, lsar_output)
return []
Expand Down Expand Up @@ -415,9 +417,14 @@ def unar_delete(archive_path):
cmd("unar", "-quiet", "-force-rename", "-no-directory", "-output-directory", output_path, archive_path)
path_utils.folder_utime(output_path, (original_stats.st_atime, original_stats.st_mtime))

lsar_json = cmd("lsar", "-json", archive_path).stdout
lsar_output = json.loads(lsar_json)
part_files = lsar_output["lsarProperties"]["XADVolumes"]
lsar_output = cmd("lsar", "-json", archive_path)
try:
lsar_json = strings.safe_json_loads(lsar_output.stdout)
except json.JSONDecodeError:
log.warning("[%s]: Error parsing lsar output as JSON: %s", archive_path, lsar_output)
return

part_files = lsar_json["lsarProperties"]["XADVolumes"]

try:
for part_file in part_files:
Expand Down
18 changes: 17 additions & 1 deletion xklb/utils/strings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import functools, html, math, operator, re, sys, textwrap
import functools, html, json, math, operator, re, sys, textwrap
from copy import deepcopy
from datetime import datetime, timedelta
from datetime import timezone as tz
Expand All @@ -11,6 +11,22 @@
from xklb.utils.log_utils import log


def safe_json_loads(s):
    """Parse JSON text, tolerating undecodable bytes and stray control characters.

    Args:
        s: The JSON document as ``str``, ``bytes`` or ``bytearray``.

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: If the text is still not valid JSON after
            control characters have been removed.
    """
    if isinstance(s, (bytes, bytearray)):
        # "utf-8-sig" drops a leading UTF-8 BOM, which json.loads rejects in
        # str input; errors="replace" maps invalid byte sequences to U+FFFD
        # instead of raising UnicodeDecodeError.
        s = s.decode("utf-8-sig", errors="replace")
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        # Retry once with C0/C1 control characters and DEL removed.
        return json.loads(re.sub(r"[\x00-\x1f\x7f-\x9f]", "", s))


def safe_json_load(path):
    """Read the file at *path* and parse its contents with ``safe_json_loads``.

    Args:
        path: Location of the JSON file; passed straight to ``open``.

    Returns:
        The decoded Python object.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If ``safe_json_loads`` cannot parse the text.
    """
    with open(path, "rb") as file:
        binary_data = file.read()
    # "utf-8-sig" strips a leading BOM (which json.loads refuses in str input);
    # errors="replace" keeps invalid bytes from raising UnicodeDecodeError.
    return safe_json_loads(binary_data.decode("utf-8-sig", errors="replace"))


def repeat_until_same(fn): # noqa: ANN201
def wrapper(*args, **kwargs):
p = args[0]
Expand Down

0 comments on commit 3793b76

Please sign in to comment.