Add modular TMDb-first movie pipeline and Discord bot

2026-04-21 21:43:35 +02:00
commit bbba110268
13 changed files with 1283 additions and 0 deletions
--- a/data_sources/anilist_source.py
+++ b/data_sources/anilist_source.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import json
+import urllib.error
+import urllib.request
+from datetime import date
+
+# GraphQL endpoint that post_graphql() sends every request to.
+ANILIST_URL = "https://graphql.anilist.co"
+
+
+def normalize_title(text: str) -> str:
+    """Return *text* in a canonical form suitable for title comparison.
+
+    Alphanumeric characters are lowercased, every other character is
+    treated as a separator, and runs of separators collapse to a single
+    space with no leading or trailing whitespace. A falsy *text*
+    (``None`` or ``""``) yields ``""``.
+    """
+    mapped = [char.lower() if char.isalnum() else " " for char in text or ""]
+    words = "".join(mapped).split()
+    return " ".join(words)
+
+
+def safe_date(sd: dict | None) -> date | None:
+    """Convert a ``{"year", "month", "day"}`` mapping into a ``date``.
+
+    Missing or falsy components count as 0. A non-positive year or month
+    makes the date unusable and yields ``None``; a non-positive day falls
+    back to the first of the month. Component combinations rejected by
+    ``date`` (for example 30 February) also yield ``None``.
+    """
+    if not sd:
+        return None
+
+    year, month, day = (sd.get(part) or 0 for part in ("year", "month", "day"))
+    if year <= 0 or month <= 0:
+        return None
+
+    # An unknown day is approximated by the first day of the month.
+    effective_day = 1 if day <= 0 else day
+    try:
+        return date(year, month, effective_day)
+    except ValueError:
+        return None
+
+
+def post_graphql(query: str, variables: dict) -> dict:
+    """Send a GraphQL request to ``ANILIST_URL`` and return the decoded JSON reply.
+
+    The query and variables are serialized as a UTF-8 JSON body; supplying
+    ``data`` makes urllib issue the request as a POST. The call waits at
+    most 30 seconds.
+
+    Raises:
+        RuntimeError: the server answered with an HTTP error status; the
+            message carries the status code and the response body.
+        Other urllib / JSON decoding errors propagate unchanged.
+    """
+    request = urllib.request.Request(
+        ANILIST_URL,
+        data=json.dumps({"query": query, "variables": variables}).encode("utf-8"),
+        headers={
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+            "User-Agent": "anime-movies-script/1.0",
+        },
+    )
+    try:
+        with urllib.request.urlopen(request, timeout=30) as response:
+            raw_body = response.read()
+    except urllib.error.HTTPError as exc:
+        # Include the server's error payload, which is more useful than the bare status.
+        body = exc.read().decode("utf-8", "replace")
+        raise RuntimeError(f"HTTP {exc.code}: {body}") from exc
+    return json.loads(raw_body.decode("utf-8"))
+
+
+def pick_best_title(title: dict) -> str:
+    """Return the preferred display title from a title mapping.
+
+    The ``english``, ``romaji`` and ``native`` entries are tried in that
+    order; the first one that is non-empty after stripping whitespace
+    wins. Returns ``""`` when none of them is usable.
+    """
+    for variant in ("english", "romaji", "native"):
+        candidate = (title.get(variant) or "").strip()
+        if candidate:
+            return candidate
+    return ""
+
+
+def map_anilist_media(media: dict | None) -> dict:
+    """Flatten a raw media object into a dict of cleaned, ready-to-use fields.
+
+    Missing or null values degrade to empty strings / empty lists rather
+    than raising. Besides the cleaned scalar fields, the result holds:
+
+    * ``genres`` / ``studio_names``: stripped, non-empty strings, plus
+      comma-joined ``genres_text`` / ``studio_text``;
+    * ``tags``: ``{"name", "rank"}`` dicts for every named tag, plus
+      ``tags_text`` listing only tags whose rank is at least 70;
+    * ``start_date``: a ``date`` or ``None`` (see ``safe_date``);
+    * ``raw``: the input mapping itself (an empty dict when *media* is falsy).
+    """
+    record = media or {}
+
+    def text_of(value) -> str:
+        # Null-tolerant strip shared by all plain string fields.
+        return (value or "").strip()
+
+    titles = record.get("title") or {}
+
+    studio_names = []
+    for node in (record.get("studios") or {}).get("nodes") or []:
+        studio = text_of((node or {}).get("name"))
+        if studio:
+            studio_names.append(studio)
+
+    genres = []
+    for item in record.get("genres") or []:
+        genre = str(item).strip()
+        if genre:
+            genres.append(genre)
+
+    tags = []
+    for entry in record.get("tags") or []:
+        if isinstance(entry, dict):
+            label = text_of(entry.get("name"))
+            rank = int(entry.get("rank") or 0)
+            if label:
+                tags.append({"name": label, "rank": rank})
+
+    # Only tags ranked 70 or higher make it into the human-readable summary.
+    prominent_tags = [entry["name"] for entry in tags if entry["rank"] >= 70]
+
+    return {
+        "id": record.get("id"),
+        "title_best": pick_best_title(titles),
+        "title_english": text_of(titles.get("english")),
+        "title_romaji": text_of(titles.get("romaji")),
+        "title_native": text_of(titles.get("native")),
+        "start_date": safe_date(record.get("startDate")),
+        "format": text_of(record.get("format")),
+        "episodes": record.get("episodes"),
+        "duration": record.get("duration"),
+        # e.g. "LIGHT_NOVEL" -> "Light Novel"
+        "source": str(record.get("source") or "").replace("_", " ").title(),
+        "description": text_of(record.get("description")),
+        "genres": genres,
+        "genres_text": ", ".join(genres),
+        "tags": tags,
+        "tags_text": ", ".join(prominent_tags),
+        "studio_names": studio_names,
+        "studio_text": ", ".join(studio_names),
+        "anilist_url": text_of(record.get("siteUrl")),
+        "cover_image": text_of((record.get("coverImage") or {}).get("large")),
+        "raw": record,
+    }
+
+
+def fetch_anilist_movie_by_search(search_text: str, cache: dict[str, dict | None]) -> dict | None:
+    """Search AniList for an anime movie by title and return the best match.
+
+    Up to five candidates are requested. Each candidate's english, romaji
+    and native titles are normalized and scored against the normalized
+    query; the candidate keeps its highest score:
+
+    * 3 - a title equals the query (the scan stops at the first such hit),
+    * 2 - one of the two strings contains the other,
+    * 1 - the first two words agree.
+
+    Args:
+        search_text: Title to look up, passed to AniList unmodified.
+        cache: Memo keyed by the normalized title. It is mutated in place;
+            misses and failures are stored as ``None`` so they are not
+            retried.
+
+    Returns:
+        The ``map_anilist_media`` record of the best-scoring candidate, or
+        ``None`` when the request fails, the response reports errors or has
+        no usable candidates, or no candidate scores above 0.
+    """
+    key = normalize_title(search_text)
+    if key in cache:
+        return cache[key]
+
+    # A query with no alphanumeric characters can never score above 0 below,
+    # so the outcome is known without a network round trip.
+    if not key:
+        cache[key] = None
+        return None
+
+    query = """
+    query ($search: String, $perPage: Int) {
+      Page(page: 1, perPage: $perPage) {
+        media(type: ANIME, format: MOVIE, search: $search, sort: [SEARCH_MATCH, POPULARITY_DESC]) {
+          id
+          title { english romaji native }
+          startDate { year month day }
+          format
+          episodes
+          duration
+          source
+          description(asHtml: false)
+          genres
+          tags { name rank }
+          coverImage { large }
+          studios { nodes { name } }
+          siteUrl
+        }
+      }
+    }
+    """
+
+    try:
+        data = post_graphql(query, {"search": search_text, "perPage": 5})
+    except Exception:
+        # Deliberate best-effort: any transport or decoding failure is a miss.
+        cache[key] = None
+        return None
+
+    if not isinstance(data, dict) or "errors" in data:
+        cache[key] = None
+        return None
+
+    # GraphQL may return explicit nulls at any level; ``.get(name, {})`` would
+    # hand back that None and crash on the next ``.get``, so use ``or`` instead.
+    page = (data.get("data") or {}).get("Page") or {}
+    candidates = [item for item in (page.get("media") or []) if isinstance(item, dict)]
+    if not candidates:
+        cache[key] = None
+        return None
+
+    wanted_head = key.split(" ")[:2]
+    best = None
+    best_score = 0
+    for media in candidates:
+        title = media.get("title") or {}
+        score = 0
+        for option in (title.get("english"), title.get("romaji"), title.get("native")):
+            normalized = normalize_title(str(option or ""))
+            if not normalized:
+                continue
+            if normalized == key:
+                score = 3
+                break
+            if key in normalized or normalized in key:
+                score = max(score, 2)
+            elif normalized.split(" ")[:2] == wanted_head:
+                score = max(score, 1)
+        # Strict ">" keeps the earliest candidate among equal scores, which
+        # preserves AniList's own relevance ordering as the tie-breaker.
+        if score > best_score:
+            best, best_score = media, score
+            if score == 3:
+                break
+
+    if best is None:
+        cache[key] = None
+        return None
+
+    mapped = map_anilist_media(best)
+    cache[key] = mapped
+    return mapped