Rewrite player + fix matching metadata loss

Streaming: Custom range-aware HTTP endpoint. Returns 206 Partial Content for Range requests (with Content-Range, Content-Length, Accept-Ranges). This was the root cause of broken seeking — Starlette's default FileResponse behavior wasn't reliable across all clients. Now seeking works natively via standard HTML5 audio. Player: Full rewrite. Cleaner separation between absolute book time and per-track time. Track switching uses pendingSeek + canplay/loadedmetadata handlers. Console logs for debugging. Removed crossOrigin to avoid CORS issues. Removed hls.js entirely. Matcher: Critical bug fix — get_work_details (OpenLibrary) was returning a sparse MatchResult that REPLACED the rich search result, losing cover, author, year. New _enrich_match merges details into best without overwriting existing values (except description, which is always preferred from the details fetch; chapters are filled in from details only when the search result has none). Scoring: Lenient min/max-weighted similarity (better for German episodic titles like "Die drei ??? - Folge 215"). Thresholds lowered: UNCERTAIN 0.50→0.40, AUTO_ACCEPT 0.75→0.65. Search: search_for_item now returns ALL fields (narrator, publisher, series, genres, description, language) so manual apply has full data. Apply: apply_match now always constructs from body first, then enriches with details. Previously OL applies would lose cover/author. Added detailed logging across matcher and apply paths. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 18:02:13 +02:00
parent 6c702cb29f
commit 17b77afd45
4 changed files with 411 additions and 185 deletions
--- a/backend/app/services/matcher.py
+++ b/backend/app/services/matcher.py
@@ -1,7 +1,7 @@
 """
 Matching-Orchestrator:
 - Erkennt deutsche Hörbuch-Serien (die drei ???, TKKG, ...)
- Versucht MusicBrainz → OpenLibrary → Google Books
+- Versucht MusicBrainz → OpenLibrary → Google Books → DNB
 - Lädt Cover herunter
 - Bewertet Konfidenz und entscheidet über Auto-Accept
 """
@@ -28,10 +28,9 @@ from .matching.dnb import search_dnb

 logger = logging.getLogger(__name__)

-AUTO_ACCEPT_THRESHOLD = 0.75
-UNCERTAIN_THRESHOLD = 0.50
+AUTO_ACCEPT_THRESHOLD = 0.65
+UNCERTAIN_THRESHOLD = 0.40

-# Bekannte deutsche Hörbuch-Serien: (regex, kanonischer Name)
 SERIES_PATTERNS = [
    (r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)\s*[-–]?\s*(?:folge\s*)?(\d+)", "Die drei ???"),
    (r"(?i)^(tkkg)\s*[-–]?\s*(?:folge\s*)?(\d+)", "TKKG"),
@@ -40,15 +39,12 @@ SERIES_PATTERNS = [
    (r"(?i)^(benjamin blümchen|benjamin blumchen)\s*[-–]?\s*(?:folge\s*)?(\d+)", "Benjamin Blümchen"),
    (r"(?i)^(bibi und tina)\s*[-–]?\s*(?:folge\s*)?(\d+)", "Bibi und Tina"),
    (r"(?i)^(der kleine vampir)\s*[-–]?\s*(?:band\s*)?(\d+)", "Der kleine Vampir"),
-    # Generisch: "Serie - Folge/Band/Teil N - Titel"
    (r"(?i)^(.+?)\s*[-–]\s*(?:folge|band|teil|nr\.?|#)\s*(\d+)", None),
-    # Generisch: "Serie (Folge N)"
    (r"(?i)^(.+?)\s*\((?:folge|band|teil|nr\.?|#|episode)\s*(\d+)\)", None),
 ]


 def detect_series(title: str) -> tuple[str | None, str | None]:
-    """Gibt (Serienname, Folgennummer) zurück oder (None, None)."""
    for pattern, canonical_name in SERIES_PATTERNS:
        m = re.match(pattern, title.strip())
        if m:
@@ -59,52 +55,80 @@ def detect_series(title: str) -> tuple[str | None, str | None]:


 def _title_similarity(a: str, b: str) -> float:
-    """Einfache Ähnlichkeit: Wort-Überlapp."""
+    """Wort-Überlapp mit Min/Max-Gewichtung — lenient für Teil-Treffer."""
    if not a or not b:
        return 0.0
-    wa = set(re.findall(r'\w+', a.lower()))
-    wb = set(re.findall(r'\w+', b.lower()))
+    wa = set(re.findall(r"\w+", a.lower()))
+    wb = set(re.findall(r"\w+", b.lower()))
    if not wa or not wb:
        return 0.0
-    return len(wa & wb) / max(len(wa), len(wb))
+    intersect = len(wa & wb)
+    if intersect == 0:
+        return 0.0
+    smaller = min(len(wa), len(wb))
+    larger = max(len(wa), len(wb))
+    return 0.7 * (intersect / smaller) + 0.3 * (intersect / larger)


 def _score_result(result: MatchResult, query_title: str, query_author: str | None) -> float:
    score = result.confidence
    title_sim = _title_similarity(result.title, query_title)
-    score = score * 0.4 + title_sim * 0.6
+    score = score * 0.3 + title_sim * 0.7
    if query_author and result.author:
        author_sim = _title_similarity(result.author, query_author)
        score = score * 0.7 + author_sim * 0.3
    return min(score, 1.0)


+def _enrich_match(best: MatchResult, details: MatchResult) -> MatchResult:
+    """Befüllt leere Felder in best mit Werten aus details. Beschreibung/Kapitel werden bevorzugt aus details übernommen."""
+    if details.description:
+        best.description = details.description
+    if details.chapters and not best.chapters:
+        best.chapters = details.chapters
+    for attr in (
+        "subtitle", "narrator", "cover_url", "publisher",
+        "publish_year", "series", "series_sequence", "language",
+    ):
+        val = getattr(details, attr, None)
+        if val and not getattr(best, attr, None):
+            setattr(best, attr, val)
+    if details.genres:
+        existing = set(best.genres or [])
+        best.genres = (best.genres or []) + [g for g in details.genres if g not in existing]
+    return best
+
+
 async def _download_cover(url: str, item_id: str) -> str | None:
-    """Lädt Cover herunter und speichert es lokal."""
    settings = get_settings()
    ext = ".jpg"
-    if ".png" in url:
+    if ".png" in url.lower():
        ext = ".png"
    dest = os.path.join(settings.covers_dir, f"{item_id}{ext}")
    logger.info(f"Cover-Download: {url}")
    try:
        async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
            r = await client.get(url)
-            if r.status_code == 200:
+            if r.status_code == 200 and len(r.content) > 1000:
                os.makedirs(settings.covers_dir, exist_ok=True)
                with open(dest, "wb") as f:
                    f.write(r.content)
                logger.info(f"Cover gespeichert: {dest} ({len(r.content)} Bytes)")
                return dest
            else:
-                logger.warning(f"Cover-Download HTTP {r.status_code}: {url}")
+                logger.warning(f"Cover-Download HTTP {r.status_code}, size={len(r.content)}: {url}")
    except Exception as e:
        logger.warning(f"Cover-Download Fehler ({url}): {e}")
    return None


 async def _apply_match(db: AsyncSession, item: LibraryItem, result: MatchResult, confidence: float):
-    """Schreibt Metadaten aus MatchResult in die DB."""
+    logger.info(
+        f"Apply match: item={item.id} title={result.title!r} author={result.author!r} "
+        f"narrator={result.narrator!r} publisher={result.publisher!r} year={result.publish_year} "
+        f"series={result.series!r}/{result.series_sequence} cover={bool(result.cover_url)} "
+        f"chapters={len(result.chapters or [])} confidence={confidence:.2f}"
+    )
    if result.title:
        item.title = result.title
    if result.subtitle and not item.subtitle:
@@ -133,7 +157,6 @@ async def _apply_match(db: AsyncSession, item: LibraryItem, result: MatchResult,
    item.match_confidence = confidence
    item.updated_at = datetime.utcnow()

-    # Cover herunterladen
    if result.cover_url and not item.cover_path:
        cover_path = await _download_cover(result.cover_url, item.id)
        if cover_path:
@@ -141,10 +164,8 @@ async def _apply_match(db: AsyncSession, item: LibraryItem, result: MatchResult,
    elif not result.cover_url:
        logger.info(f"Kein Cover-URL in Match-Ergebnis ({result.source}: {result.source_id})")

-    # Kapitel aus MusicBrainz-Tracklisting
    if result.chapters:
        from sqlalchemy import delete
-        from ..models.media_item import Chapter
        await db.execute(delete(Chapter).where(Chapter.library_item_id == item.id))
        for idx, ch in enumerate(result.chapters):
            chapter = Chapter(
@@ -156,7 +177,6 @@ async def _apply_match(db: AsyncSession, item: LibraryItem, result: MatchResult,
            )
            db.add(chapter)

-    # zu_prüfen entfernen wenn Konfidenz hoch genug
    if confidence >= AUTO_ACCEPT_THRESHOLD:
        tags = item.tags or []
        item.tags = [t for t in tags if t != "zu_prüfen"]
@@ -171,17 +191,12 @@ _SOURCE_FUNCS = {


 async def match_audiobook(item_id: str):
-    """
-    Haupt-Matching-Funktion. Wird nach dem Scan als Hintergrund-Task gestartet.
-    Quellen und Reihenfolge werden aus den Library-Settings gelesen.
-    """
    async with AsyncSessionLocal() as db:
        result_row = await db.execute(select(LibraryItem).where(LibraryItem.id == item_id))
        item = result_row.scalar_one_or_none()
        if not item or item.match_locked:
            return

-        # Globale Auto-Match Einstellung prüfen
        setting = await db.execute(
            select(ServerSetting).where(ServerSetting.key == "autoMatchBooks")
        )
@@ -189,7 +204,6 @@ async def match_audiobook(item_id: str):
        if s and s.value is False:
            return

-        # Matching-Quellen aus Library-Settings lesen
        lib_row = await db.execute(select(Library).where(Library.id == item.library_id))
        lib = lib_row.scalar_one_or_none()
        sources: list[str] = (
@@ -209,13 +223,13 @@ async def match_audiobook(item_id: str):
            if not item.series_sequence and episode:
                item.series_sequence = episode

-        logger.info(f"Matche: '{title}' | Quellen: {sources}")
+        logger.info(f"Matche: '{title}' (Such-Titel: '{search_title}') | Quellen: {sources}")

        best: MatchResult | None = None
        best_score = 0.0

        for source_name in sources:
-            if best_score >= UNCERTAIN_THRESHOLD:
+            if best_score >= AUTO_ACCEPT_THRESHOLD:
                break
            funcs = _SOURCE_FUNCS.get(source_name)
            if not funcs:
@@ -223,20 +237,26 @@ async def match_audiobook(item_id: str):
            search_func, details_func = funcs
            try:
                results = await search_func(search_title, author)
+                logger.info(f"{source_name}: {len(results)} Treffer")
+                local_best: MatchResult | None = None
+                local_score = 0.0
                for r in results:
                    score = _score_result(r, title, author)
-                    if score > best_score:
-                        best_score = score
-                        best = r
-                # Details holen wenn Treffer gut genug (z.B. MB Tracklist)
-                if best and best.source == source_name and best_score >= UNCERTAIN_THRESHOLD and details_func:
-                    try:
-                        details = await details_func(best.source_id)
-                        if details:
-                            details.confidence = best_score
-                            best = details
-                    except Exception as e:
-                        logger.warning(f"{source_name} Details Fehler: {e}")
+                    logger.info(f"  → {r.title!r} ({r.author!r}) score={score:.2f}")
+                    if score > local_score:
+                        local_score = score
+                        local_best = r
+                if local_best and local_score > best_score:
+                    best_score = local_score
+                    best = local_best
+                    if details_func and local_score >= UNCERTAIN_THRESHOLD:
+                        try:
+                            details = await details_func(local_best.source_id)
+                            if details:
+                                _enrich_match(best, details)
+                                logger.info(f"{source_name}: Details geladen für {local_best.source_id}")
+                        except Exception as e:
+                            logger.warning(f"{source_name} Details Fehler: {e}")
            except Exception as e:
                logger.warning(f"{source_name} Fehler: {e}")

@@ -248,18 +268,17 @@ async def match_audiobook(item_id: str):
            except Exception as e:
                logger.error(f"_apply_match fehlgeschlagen für '{title}': {e}", exc_info=True)
        else:
-            logger.info(f"Kein Match für '{title}' (beste Konfidenz: {best_score:.2f})")
+            logger.info(f"Kein Match für '{title}' (beste Konfidenz: {best_score:.2f}, Schwelle: {UNCERTAIN_THRESHOLD})")
            await db.commit()


 async def search_for_item(title: str, author: str | None = None) -> list[dict]:
-    """Suche über alle Quellen – für manuelles Matching."""
-    results = []
-
+    """Suche über alle Quellen – für manuelles Matching. Gibt alle relevanten Felder zurück."""
    async def _search_source(coro):
        try:
            return await coro
-        except Exception:
+        except Exception as e:
+            logger.warning(f"Such-Fehler: {e}")
            return []

    mb, ol, gb, dnb = await asyncio.gather(
@@ -269,16 +288,26 @@ async def search_for_item(title: str, author: str | None = None) -> list[dict]:
        _search_source(search_dnb(title, author)),
    )

+    results = []
    for r in mb + ol + gb + dnb:
        results.append({
            "source": r.source,
            "id": r.source_id,
            "title": r.title,
+            "subtitle": r.subtitle,
            "author": r.author,
+            "narrator": r.narrator,
+            "description": r.description,
+            "publisher": r.publisher,
            "publishYear": r.publish_year,
+            "series": r.series,
+            "seriesSequence": r.series_sequence,
+            "language": r.language,
+            "genres": r.genres,
            "cover": r.cover_url,
            "confidence": r.confidence,
        })

    results.sort(key=lambda x: x["confidence"], reverse=True)
+    logger.info(f"Manuelle Suche '{title}' (author={author!r}): {len(results)} Treffer total")
    return results