Skip to content

Commit

Permalink
Update Jacquie Et Michel for their new multilingual studio objects
Browse files Browse the repository at this point in the history
  • Loading branch information
Maista6969 committed Jan 11, 2025
1 parent 5d74a66 commit 55f1137
Showing 1 changed file with 37 additions and 39 deletions.
76 changes: 37 additions & 39 deletions scrapers/JacquieEtMichelTV/JacquieEtMichelTV.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,67 +3,63 @@
import json
import sys
from py_common import log
from py_common.types import ScrapedPerformer, ScrapedScene, ScrapedStudio, ScrapedTag
from py_common.util import scraper_args
from py_common.types import ScrapedScene
from py_common.util import scraper_args, dig
from py_common.deps import ensure_requirements

ensure_requirements("cloudscraper", "lxml")

import cloudscraper
from lxml import html
import cloudscraper # noqa: E402
from lxml import html # noqa: E402

scraper = cloudscraper.create_scraper()

def scene_from_url(url: str) -> ScrapedScene:
scene = ScrapedScene()

scraper = cloudscraper.create_scraper()
def scene_from_url(url: str) -> ScrapedScene | None:
try:
scraped = scraper.get(url)
scraped.raise_for_status()
except Exception as ex:
log.error(f"Error getting URL: {ex}")
sys.exit(1)
log.error(f"Error scraping {url}: {ex}")
return None

tree = html.fromstring(scraped.text)

video_data = None
video_data_elems = tree.xpath("//script[@type='application/ld+json']")
for d in video_data_elems:
for d in tree.xpath("//script[@type='application/ld+json']"):
if '"@type": "VideoObject"' in d.text:
video_data = json.loads(d.text)[0]
break
if not video_data:
log.error("No VideoObject data found.")
sys.exit(1)
else:
log.error(f"No VideoObject data found at {url}")
return None

scene = ScrapedScene({
log.debug(f"Video data: {json.dumps(video_data)}")
scene: ScrapedScene = {
"title": video_data["name"],
"details": video_data["description"],
"studio": ScrapedStudio(name=video_data["productionCompany"]),
"tags": [ScrapedTag(name=t) for t in video_data["keywords"].split(",")],
"performers": [ScrapedPerformer(name=a["name"]) for a in video_data["actor"]]
})
"date": dt.fromisoformat(video_data["datePublished"]).date().isoformat(),
"tags": [{"name": t} for t in video_data["keywords"].split(",")],
}

# If no performers look in zeder elem
if not scene["performers"]:
try:
zeder_elem = tree.xpath("//div[contains(@class, '-zeder-detail-')]")[0]
zeder_attrs = zeder_elem.attrib
for k, v in zeder_attrs.items():
if "data-zeder-actor-" in k:
scene["performers"].append(ScrapedPerformer(name=v.replace("-", " ").title()))
except IndexError:
pass
if studio := dig(video_data, "productionCompany", "en"):
scene["studio"] = {"name": studio}

scene["date"] = dt.fromisoformat(video_data["datePublished"]).strftime("%Y-%m-%d")
if dig(video_data, "actor"):
scene["performers"] = [{"name": a["name"]} for a in video_data["actor"]]
# Performers are also listed in the data-zeder-actor-* attributes
# but they do not have accented characters and are not capitalized
elif actors := tree.xpath("/html/body/div/@*[contains(name(), 'zeder-actor-')]"):
scene["performers"] = [{"name": a.replace("-", " ").title()} for a in actors]

image_url = tree.xpath("//meta[@property='og:image']/@content")[0]
try:
img = scraper.get(image_url).content
scraped.raise_for_status()
scene["image"] = "data:image/jpeg;base64," + base64.b64encode(img).decode()
except Exception as ex:
log.error(f"Failed to get image: {ex}")
for image_url in tree.xpath("//meta[@property='og:image']/@content"):
try:
img = scraper.get(image_url)
img.raise_for_status()
scene["image"] = (
"data:image/jpeg;base64," + base64.b64encode(img.content).decode()
)
except Exception as ex:
log.error(f"Failed to get image from {image_url}: {ex}")

return scene

Expand All @@ -75,7 +71,9 @@ def scene_from_url(url: str) -> ScrapedScene:
case "scene-by-url", {"url": url} if url:
result = scene_from_url(url)
case _:
log.error(f"Not Implemented: Operation: {op}, arguments: {json.dumps(args)}")
log.error(
f"Not Implemented: Operation: {op}, arguments: {json.dumps(args)}"
)
sys.exit(1)

print(json.dumps(result))

0 comments on commit 55f1137

Please sign in to comment.