Add modular TMDb-first movie pipeline and Discord bot

2026-04-21 21:43:35 +02:00
commit bbba110268
13 changed files with 1283 additions and 0 deletions
--- a/data_sources/animeschedule_source.py
+++ b/data_sources/animeschedule_source.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import json
+import urllib.parse
+import urllib.request
+
+# Root of the AnimeSchedule v3 REST API; endpoint paths such as "/anime" are
+# appended to this when building request URLs.
+ANIMESCHEDULE_API_BASE = "https://animeschedule.net/api/v3"
+
+
+def normalize_title(text: str) -> str:
+    cleaned = "".join(ch.lower() if ch.isalnum() else " " for ch in (text or ""))
+    return " ".join(cleaned.split())
+
+
+def flatten_items(payload: dict | list) -> list[dict]:
+    if isinstance(payload, list):
+        return [item for item in payload if isinstance(item, dict)]
+    if not isinstance(payload, dict):
+        return []
+
+    for key in ("anime", "items", "data", "results"):
+        value = payload.get(key)
+        if isinstance(value, list):
+            return [item for item in value if isinstance(item, dict)]
+    return []
+
+
+def extract_names(item: dict) -> list[str]:
+    names = []
+    direct = [item.get("title"), item.get("name"), item.get("romaji"), item.get("english"), item.get("native")]
+    for candidate in direct:
+        text = (candidate or "").strip()
+        if text:
+            names.append(text)
+
+    nested = item.get("names") or {}
+    if isinstance(nested, dict):
+        for candidate in nested.values():
+            if isinstance(candidate, list):
+                for value in candidate:
+                    text = (str(value) if value is not None else "").strip()
+                    if text:
+                        names.append(text)
+                continue
+
+            text = (str(candidate) if candidate is not None else "").strip()
+            if text:
+                names.append(text)
+
+    unique = []
+    seen = set()
+    for name in names:
+        lowered = name.lower()
+        if lowered in seen:
+            continue
+        seen.add(lowered)
+        unique.append(name)
+    return unique
+
+
+def extract_url(item: dict) -> str:
+    slug = (item.get("slug") or item.get("route") or item.get("id") or "").strip()
+    if slug:
+        return f"https://animeschedule.net/anime/{slug}"
+
+    websites = item.get("websites") or item.get("links") or []
+    if isinstance(websites, list):
+        for entry in websites:
+            if not isinstance(entry, dict):
+                continue
+            url = (entry.get("url") or "").strip()
+            if url:
+                return url
+    return ""
+
+
+def extract_english_title(item: dict) -> str:
+    direct = [item.get("english"), item.get("titleEnglish"), item.get("englishTitle")]
+    for candidate in direct:
+        text = (candidate or "").strip()
+        if text:
+            return text
+
+    names = item.get("names") or {}
+    if isinstance(names, dict):
+        for key, value in names.items():
+            key_norm = str(key).strip().lower()
+            if key_norm in {"en", "eng", "english", "titleenglish"}:
+                text = (str(value) if value is not None else "").strip()
+                if text:
+                    return text
+    return ""
+
+
+def extract_list(item: dict, *keys: str) -> str:
+    for key in keys:
+        value = item.get(key)
+        if not value:
+            continue
+        if isinstance(value, str):
+            text = value.strip()
+            if text:
+                return text
+        if isinstance(value, list):
+            names = []
+            for entry in value:
+                if isinstance(entry, str):
+                    text = entry.strip()
+                elif isinstance(entry, dict):
+                    text = (entry.get("name") or entry.get("title") or "").strip()
+                else:
+                    text = ""
+                if text:
+                    names.append(text)
+            if names:
+                return ", ".join(names)
+    return ""
+
+
+def extract_format(item: dict) -> str:
+    for key in ("format", "mediaType", "type"):
+        value = item.get(key)
+        text = (str(value) if value is not None else "").strip()
+        if text:
+            return text
+    return ""
+
+
+def extract_description(item: dict) -> str:
+    for key in ("description", "synopsis", "overview", "summary"):
+        value = item.get(key)
+        text = (str(value) if value is not None else "").strip()
+        if text:
+            return text
+
+    details = item.get("details") or {}
+    if isinstance(details, dict):
+        for key in ("description", "synopsis", "overview", "summary"):
+            value = details.get(key)
+            text = (str(value) if value is not None else "").strip()
+            if text:
+                return text
+    return ""
+
+
+def empty_result() -> dict:
+    return {
+        "url": "",
+        "title": "",
+        "english_title": "",
+        "format": "",
+        "genres": "",
+        "studios": "",
+        "description": "",
+        "names": [],
+        "match_score": 0,
+        "raw": {},
+    }
+
+
+def title_match_score(wanted: str, names: list[str]) -> int:
+    wanted_norm = normalize_title(wanted)
+    if not wanted_norm:
+        return 0
+
+    best = 0
+    wanted_parts = wanted_norm.split(" ")
+
+    for candidate in names:
+        normalized = normalize_title(candidate)
+        if not normalized:
+            continue
+
+        if normalized == wanted_norm:
+            best = max(best, 4)
+            continue
+
+        if wanted_norm in normalized or normalized in wanted_norm:
+            best = max(best, 3)
+            continue
+
+        normalized_parts = normalized.split(" ")
+        if normalized_parts[:2] == wanted_parts[:2] and len(normalized_parts) >= 2 and len(wanted_parts) >= 2:
+            best = max(best, 2)
+            continue
+
+        if set(normalized_parts) & set(wanted_parts):
+            best = max(best, 1)
+
+    return best
+
+
+def fetch_animeschedule_anime_by_title(title: str, token: str, cache: dict[str, dict]) -> dict:
+    key = normalize_title(title)
+    if key in cache:
+        return cache[key]
+
+    if not token:
+        cache[key] = empty_result()
+        return cache[key]
+
+    params = {"search": title, "take": "10"}
+    url = ANIMESCHEDULE_API_BASE + "/anime?" + urllib.parse.urlencode(params)
+    req = urllib.request.Request(
+        url,
+        headers={
+            "Accept": "application/json",
+            "Authorization": f"Bearer {token}",
+            "User-Agent": "anime-movies-script/1.0",
+        },
+    )
+
+    try:
+        with urllib.request.urlopen(req, timeout=20) as resp:
+            payload = json.loads(resp.read().decode("utf-8"))
+    except Exception:
+        cache[key] = empty_result()
+        return cache[key]
+
+    candidates = flatten_items(payload)
+    if not candidates:
+        cache[key] = empty_result()
+        return cache[key]
+
+    best = None
+    best_score = -1
+
+    for item in candidates:
+        names = extract_names(item)
+        if not names:
+            continue
+
+        score = title_match_score(title, names)
+        if score <= 0:
+            continue
+
+        if score > best_score:
+            best_score = score
+            best = item
+            if score == 4:
+                break
+
+    if not best:
+        cache[key] = empty_result()
+        return cache[key]
+
+    names = extract_names(best)
+    cache[key] = {
+        "url": extract_url(best),
+        "title": names[0] if names else "",
+        "english_title": extract_english_title(best),
+        "format": extract_format(best),
+        "genres": extract_list(best, "genres", "genre", "categories"),
+        "studios": extract_list(best, "studios", "studio"),
+        "description": extract_description(best),
+        "names": names,
+        "match_score": best_score,
+        "raw": best,
+    }
+    return cache[key]