diff --git a/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.py b/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.py
index f2fb2ff7e..257949e5d 100644
--- a/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.py
+++ b/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.py
@@ -3,67 +3,63 @@
 import json
 import sys
 from py_common import log
-from py_common.types import ScrapedPerformer, ScrapedScene, ScrapedStudio, ScrapedTag
-from py_common.util import scraper_args
+from py_common.types import ScrapedScene
+from py_common.util import scraper_args, dig
 from py_common.deps import ensure_requirements
 
 ensure_requirements("cloudscraper", "lxml")
-import cloudscraper
-from lxml import html
+import cloudscraper  # noqa: E402
+from lxml import html  # noqa: E402
 
+scraper = cloudscraper.create_scraper()
 
-def scene_from_url(url: str) -> ScrapedScene:
-    scene = ScrapedScene()
-    scraper = cloudscraper.create_scraper()
 
+def scene_from_url(url: str) -> ScrapedScene | None:
     try:
         scraped = scraper.get(url)
         scraped.raise_for_status()
     except Exception as ex:
-        log.error(f"Error getting URL: {ex}")
-        sys.exit(1)
+        log.error(f"Error scraping {url}: {ex}")
+        return None
 
     tree = html.fromstring(scraped.text)
 
-    video_data = None
-    video_data_elems = tree.xpath("//script[@type='application/ld+json']")
-    for d in video_data_elems:
+    for d in tree.xpath("//script[@type='application/ld+json']"):
         if '"@type": "VideoObject"' in d.text:
             video_data = json.loads(d.text)[0]
             break
-    if not video_data:
-        log.error("No VideoObject data found.")
-        sys.exit(1)
+    else:
+        log.error(f"No VideoObject data found at {url}")
+        return None
 
-    scene = ScrapedScene({
+    log.debug(f"Video data: {json.dumps(video_data)}")
+    scene: ScrapedScene = {
         "title": video_data["name"],
         "details": video_data["description"],
-        "studio": ScrapedStudio(name=video_data["productionCompany"]),
-        "tags": [ScrapedTag(name=t) for t in video_data["keywords"].split(",")],
-        "performers": [ScrapedPerformer(name=a["name"]) for a in video_data["actor"]]
-    })
+        "date": dt.fromisoformat(video_data["datePublished"]).date().isoformat(),
+        "tags": [{"name": t} for t in video_data["keywords"].split(",")],
+    }
 
-    # If no performers look in zeder elem
-    if not scene["performers"]:
-        try:
-            zeder_elem = tree.xpath("//div[contains(@class, '-zeder-detail-')]")[0]
-            zeder_attrs = zeder_elem.attrib
-            for k, v in zeder_attrs.items():
-                if "data-zeder-actor-" in k:
-                    scene["performers"].append(ScrapedPerformer(name=v.replace("-", " ").title()))
-        except IndexError:
-            pass
+    if studio := dig(video_data, "productionCompany", "en"):
+        scene["studio"] = {"name": studio}
 
-    scene["date"] = dt.fromisoformat(video_data["datePublished"]).strftime("%Y-%m-%d")
+    if dig(video_data, "actor"):
+        scene["performers"] = [{"name": a["name"]} for a in video_data["actor"]]
+    # Performers are also listed in the data-zeder-actor-* attributes
+    # but they do not have accented characters and are not capitalized
+    elif actors := tree.xpath("/html/body/div/@*[contains(name(), 'zeder-actor-')]"):
+        scene["performers"] = [{"name": a.replace("-", " ").title()} for a in actors]
 
-    image_url = tree.xpath("//meta[@property='og:image']/@content")[0]
-    try:
-        img = scraper.get(image_url).content
-        scraped.raise_for_status()
-        scene["image"] = "data:image/jpeg;base64," + base64.b64encode(img).decode()
-    except Exception as ex:
-        log.error(f"Failed to get image: {ex}")
+    for image_url in tree.xpath("//meta[@property='og:image']/@content"):
+        try:
+            img = scraper.get(image_url)
+            img.raise_for_status()
+            scene["image"] = (
+                "data:image/jpeg;base64," + base64.b64encode(img.content).decode()
+            )
+        except Exception as ex:
+            log.error(f"Failed to get image from {image_url}: {ex}")
 
     return scene
@@ -75,7 +71,9 @@ def scene_from_url(url: str) -> ScrapedScene:
         case "scene-by-url", {"url": url} if url:
             result = scene_from_url(url)
         case _:
-            log.error(f"Not Implemented: Operation: {op}, arguments: {json.dumps(args)}")
+            log.error(
+                f"Not Implemented: Operation: {op}, arguments: {json.dumps(args)}"
+            )
             sys.exit(1)
 
     print(json.dumps(result))