"""
Matching orchestrator:
- Detects German audiobook series (Die drei ???, TKKG, ...)
- Tries MusicBrainz → OpenLibrary → Google Books → DNB
- Downloads covers
- Scores confidence and decides on auto-accept
"""
import re
import os
import logging
import httpx
import asyncio
from pathlib import Path
from datetime import datetime, timezone

from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import delete, select

from ..config import get_settings
from ..models.media_item import LibraryItem, BookFile, Chapter
from ..models.library import Library
from ..models.session import ServerSetting
from ..database import AsyncSessionLocal
from .matching.base import MatchResult
from .matching.musicbrainz import search_musicbrainz, get_release_details
from .matching.open_library import search_open_library, get_work_details
from .matching.google_books import search_google_books
from .matching.dnb import search_dnb

logger = logging.getLogger(__name__)

# Scores at or above this value clear the "zu_prüfen" review tag and stop
# the source loop early.
AUTO_ACCEPT_THRESHOLD = 0.65
# Scores below this value are discarded; nothing is applied to the item.
UNCERTAIN_THRESHOLD = 0.40

# (regex, canonical series name). Group 1 is the series name as written in the
# title, group 2 the episode number. A canonical name of None means "use the
# text captured by group 1".
SERIES_PATTERNS = [
    (r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)\s*[-–]?\s*(?:folge\s*)?(\d+)", "Die drei ???"),
    (r"(?i)^(tkkg)\s*[-–]?\s*(?:folge\s*)?(\d+)", "TKKG"),
    (r"(?i)^(fünf freunde|funf freunde)\s*[-–]?\s*(?:band\s*)?(\d+)", "Fünf Freunde"),
    (r"(?i)^(bibi blocksberg)\s*[-–]?\s*(?:folge\s*)?(\d+)", "Bibi Blocksberg"),
    (r"(?i)^(benjamin blümchen|benjamin blumchen)\s*[-–]?\s*(?:folge\s*)?(\d+)", "Benjamin Blümchen"),
    (r"(?i)^(bibi und tina)\s*[-–]?\s*(?:folge\s*)?(\d+)", "Bibi und Tina"),
    (r"(?i)^(der kleine vampir)\s*[-–]?\s*(?:band\s*)?(\d+)", "Der kleine Vampir"),
    (r"(?i)^(.+?)\s*[-–]\s*(?:folge|band|teil|nr\.?|#)\s*(\d+)", None),
    (r"(?i)^(.+?)\s*\((?:folge|band|teil|nr\.?|#|episode)\s*(\d+)\)", None),
]


def detect_series(title: str) -> tuple[str | None, str | None]:
    """Return ``(series, episode)`` parsed from *title*.

    Patterns in ``SERIES_PATTERNS`` are tried in order and the first match
    wins. Returns ``(None, None)`` when no pattern matches.
    """
    stripped = title.strip()  # strip once instead of once per pattern
    for pattern, canonical_name in SERIES_PATTERNS:
        m = re.match(pattern, stripped)
        if m:
            series = canonical_name or m.group(1).strip()
            return series, m.group(2)
    return None, None


def _title_similarity(a: str, b: str) -> float:
    """Return a word-overlap similarity in ``[0, 1]``.

    The overlap is weighted 70 % against the smaller word set and 30 %
    against the larger one, which keeps the score lenient for partial hits.
    """
    if not a or not b:
        return 0.0
    wa = set(re.findall(r"\w+", a.lower()))
    wb = set(re.findall(r"\w+", b.lower()))
    if not wa or not wb:
        return 0.0
    intersect = len(wa & wb)
    if intersect == 0:
        return 0.0
    smaller = min(len(wa), len(wb))
    larger = max(len(wa), len(wb))
    return 0.7 * (intersect / smaller) + 0.3 * (intersect / larger)


def _score_result(result: MatchResult, query_title: str, query_author: str | None) -> float:
    """Blend the source's own confidence with title and author similarity.

    The base score is 30 % source confidence and 70 % title similarity. When
    both the query and the result carry an author, that base is weighted 70 %
    against 30 % author similarity. The result is capped at 1.0.
    """
    title_sim = _title_similarity(result.title, query_title)
    score = result.confidence * 0.3 + title_sim * 0.7
    if query_author and result.author:
        author_sim = _title_similarity(result.author, query_author)
        score = score * 0.7 + author_sim * 0.3
    return min(score, 1.0)


def _enrich_match(best: MatchResult, details: MatchResult) -> MatchResult:
    """Fill empty fields of *best* with values from *details*, in place.

    A description from *details* always wins. Chapters and the scalar fields
    are only copied when *best* has none. Genres are merged, keeping the
    order of *best* and appending unseen genres from *details* once each.
    Returns *best*.
    """
    if details.description:
        best.description = details.description
    if details.chapters and not best.chapters:
        best.chapters = details.chapters
    for attr in (
        "subtitle",
        "narrator",
        "cover_url",
        "publisher",
        "publish_year",
        "series",
        "series_sequence",
        "language",
    ):
        val = getattr(details, attr, None)
        if val and not getattr(best, attr, None):
            setattr(best, attr, val)
    if details.genres:
        merged = list(best.genres or [])
        seen = set(merged)
        for genre in details.genres:
            # Also guards against duplicates inside details.genres itself.
            if genre not in seen:
                seen.add(genre)
                merged.append(genre)
        best.genres = merged
    return best


def _cover_extension(url: str, content_type: str | None) -> str:
    """Choose the cover file extension.

    The response's declared media type is preferred. When it is missing or
    not a known image type, fall back to looking for ".png" in the URL.
    """
    mime = (content_type or "").split(";", 1)[0].strip().lower()
    if mime == "image/png":
        return ".png"
    if mime in ("image/jpeg", "image/jpg"):
        return ".jpg"
    return ".png" if ".png" in url.lower() else ".jpg"


def _write_cover_file(covers_dir: str, dest: str, data: bytes) -> None:
    """Create *covers_dir* if needed and write *data* to *dest* (blocking)."""
    os.makedirs(covers_dir, exist_ok=True)
    with open(dest, "wb") as f:
        f.write(data)


async def _download_cover(url: str, item_id: str) -> str | None:
    """Download the cover at *url* into the configured covers directory.

    Returns the path of the written file, or ``None`` when the request fails,
    the status is not 200, or the body is 1000 bytes or smaller. Errors are
    logged and swallowed: cover download is best-effort.
    """
    settings = get_settings()
    logger.info(f"Cover-Download: {url}")
    try:
        async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
            r = await client.get(url)
            if r.status_code == 200 and len(r.content) > 1000:
                ext = _cover_extension(url, r.headers.get("content-type"))
                dest = os.path.join(settings.covers_dir, f"{item_id}{ext}")
                # File I/O is blocking; keep it off the event loop.
                await asyncio.to_thread(_write_cover_file, settings.covers_dir, dest, r.content)
                logger.info(f"Cover gespeichert: {dest} ({len(r.content)} Bytes)")
                return dest
            else:
                logger.warning(f"Cover-Download HTTP {r.status_code}, size={len(r.content)}: {url}")
    except Exception as e:
        logger.warning(f"Cover-Download Fehler ({url}): {e}")
    return None


async def _apply_match(db: AsyncSession, item: LibraryItem, result: MatchResult, confidence: float) -> None:
    """Copy the metadata of *result* onto *item* and record the match.

    Non-empty fields of *result* overwrite the item's fields, except the
    subtitle and the cover, which are only set when the item has none.
    Chapters from *result* replace all existing chapters of the item. At or
    above ``AUTO_ACCEPT_THRESHOLD`` the "zu_prüfen" tag is removed.

    Does not commit; the caller owns the transaction.
    """
    logger.info(
        f"Apply match: item={item.id} title={result.title!r} author={result.author!r} "
        f"narrator={result.narrator!r} publisher={result.publisher!r} year={result.publish_year} "
        f"series={result.series!r}/{result.series_sequence} cover={bool(result.cover_url)} "
        f"chapters={len(result.chapters or [])} confidence={confidence:.2f}"
    )

    # Fields that are overwritten whenever the match provides a value.
    for attr in (
        "title",
        "author",
        "narrator",
        "description",
        "publisher",
        "publish_year",
        "language",
        "genres",
        "series",
        "series_sequence",
    ):
        value = getattr(result, attr)
        if value:
            setattr(item, attr, value)
    if result.subtitle and not item.subtitle:
        item.subtitle = result.subtitle

    item.matched_source = result.source
    item.matched_id = result.source_id
    item.match_confidence = confidence
    # Naive UTC timestamp, same value as the deprecated datetime.utcnow().
    item.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)

    if result.cover_url and not item.cover_path:
        cover_path = await _download_cover(result.cover_url, item.id)
        if cover_path:
            item.cover_path = cover_path
    elif not result.cover_url:
        logger.info(f"Kein Cover-URL in Match-Ergebnis ({result.source}: {result.source_id})")

    if result.chapters:
        await db.execute(delete(Chapter).where(Chapter.library_item_id == item.id))
        for idx, ch in enumerate(result.chapters):
            chapter = Chapter(
                library_item_id=item.id,
                chapter_index=idx,
                title=ch.get("title", f"Kapitel {idx + 1}"),
                start_seconds=ch.get("start", 0.0),
                end_seconds=ch.get("end", 0.0),
            )
            db.add(chapter)

    if confidence >= AUTO_ACCEPT_THRESHOLD:
        tags = item.tags or []
        item.tags = [t for t in tags if t != "zu_prüfen"]


# source name -> (search function, optional detail-lookup function)
_SOURCE_FUNCS = {
    "musicbrainz": (search_musicbrainz, get_release_details),
    "open_library": (search_open_library, get_work_details),
    "google_books": (search_google_books, None),
    "dnb": (search_dnb, None),
}


async def match_audiobook(item_id: str):
    """Try to match the library item *item_id* against the metadata sources.

    Does nothing when the item is missing or match-locked, or when the
    "autoMatchBooks" server setting is ``False``. Sources are queried in the
    library's configured order (all sources by default) until a result
    reaches ``AUTO_ACCEPT_THRESHOLD``. The best result is applied when it
    scores at least ``UNCERTAIN_THRESHOLD``; otherwise only a detected
    series/episode is persisted.
    """
    async with AsyncSessionLocal() as db:
        result_row = await db.execute(select(LibraryItem).where(LibraryItem.id == item_id))
        item = result_row.scalar_one_or_none()
        if not item or item.match_locked:
            return

        setting = await db.execute(
            select(ServerSetting).where(ServerSetting.key == "autoMatchBooks")
        )
        s = setting.scalar_one_or_none()
        if s and s.value is False:
            return

        lib_row = await db.execute(select(Library).where(Library.id == item.library_id))
        lib = lib_row.scalar_one_or_none()
        # A missing or None "match_sources" means "all sources". An explicit
        # empty list is honoured and disables searching.
        configured = (lib.settings or {}).get("match_sources") if lib else None
        sources: list[str] = (
            list(configured) if configured is not None else list(_SOURCE_FUNCS.keys())
        )

        title = item.title or ""
        author = item.author

        series, episode = detect_series(title)
        search_title = title
        if series:
            search_title = f"{series} {episode}" if episode else series
            if not item.series:
                item.series = series
            if not item.series_sequence and episode:
                item.series_sequence = episode

        logger.info(f"Matche: '{title}' (Such-Titel: '{search_title}') | Quellen: {sources}")

        best: MatchResult | None = None
        best_score = 0.0

        for source_name in sources:
            if best_score >= AUTO_ACCEPT_THRESHOLD:
                break
            funcs = _SOURCE_FUNCS.get(source_name)
            if not funcs:
                continue
            search_func, details_func = funcs
            try:
                results = await search_func(search_title, author)
                logger.info(f"{source_name}: {len(results)} Treffer")

                local_best: MatchResult | None = None
                local_score = 0.0
                for r in results:
                    score = _score_result(r, title, author)
                    logger.info(f" → {r.title!r} ({r.author!r}) score={score:.2f}")
                    if score > local_score:
                        local_score = score
                        local_best = r

                if local_best and local_score > best_score:
                    best_score = local_score
                    best = local_best

                    if details_func and local_score >= UNCERTAIN_THRESHOLD:
                        try:
                            details = await details_func(local_best.source_id)
                            if details:
                                # Enrich the result the details belong to.
                                _enrich_match(local_best, details)
                                logger.info(f"{source_name}: Details geladen für {local_best.source_id}")
                        except Exception as e:
                            logger.warning(f"{source_name} Details Fehler: {e}")
            except Exception as e:
                logger.warning(f"{source_name} Fehler: {e}")

        if best and best_score >= UNCERTAIN_THRESHOLD:
            try:
                await _apply_match(db, item, best, best_score)
                # Read before commit: attributes may be expired afterwards and
                # cannot be lazily reloaded from async code.
                applied_title = item.title
                await db.commit()
                logger.info(f"Match angewendet: '{applied_title}' ← {best.source} ({best_score:.2f})")
            except Exception as e:
                logger.error(f"_apply_match fehlgeschlagen für '{title}': {e}", exc_info=True)
                # Drop the half-applied changes instead of leaving them pending.
                await db.rollback()
        else:
            logger.info(f"Kein Match für '{title}' (beste Konfidenz: {best_score:.2f}, Schwelle: {UNCERTAIN_THRESHOLD})")
            # Persist a series/episode detected from the title.
            await db.commit()


async def search_for_item(title: str, author: str | None = None) -> list[dict]:
    """Search all sources concurrently, for manual matching.

    Returns one dict per hit with all relevant fields, sorted by the source's
    confidence in descending order. A failing source is logged and
    contributes no results.
    """

    async def _search_source(coro):
        try:
            return await coro
        except Exception as e:
            logger.warning(f"Such-Fehler: {e}")
            return []

    mb, ol, gb, dnb = await asyncio.gather(
        _search_source(search_musicbrainz(title, author)),
        _search_source(search_open_library(title, author)),
        _search_source(search_google_books(title, author)),
        _search_source(search_dnb(title, author)),
    )

    results = [
        {
            "source": r.source,
            "id": r.source_id,
            "title": r.title,
            "subtitle": r.subtitle,
            "author": r.author,
            "narrator": r.narrator,
            "description": r.description,
            "publisher": r.publisher,
            "publishYear": r.publish_year,
            "series": r.series,
            "seriesSequence": r.series_sequence,
            "language": r.language,
            "genres": r.genres,
            "cover": r.cover_url,
            "confidence": r.confidence,
        }
        for r in mb + ol + gb + dnb
    ]
    # Treat a missing confidence as 0 so the sort cannot raise on None.
    results.sort(key=lambda x: x["confidence"] or 0.0, reverse=True)
    logger.info(f"Manuelle Suche '{title}' (author={author!r}): {len(results)} Treffer total")
    return results