Make matching debuggable + fix metadata search blockers

DNB rewrite: - Multiple query strategies with fallback (title+author+mat=ton → title+author → title+mat=ton → title-only → fulltext). Returns on first hit. Most German audiobooks aren't tagged mat=ton in DNB, which was killing all searches. - Strip CQL wildcard chars (?, *, <, >, =, /, quotes) from search terms. The "???" in "Die drei ???" was breaking the CQL parser. - Log HTTP status, body snippet on non-200, and numberOfRecords on every query so log shows exactly what DNB returned. - Parse SRU diagnostic elements (DNB error messages buried in XML). - Convert author/narrator from "Lastname, Firstname" to "Firstname Lastname" for consistency with other sources. Matcher: - Split series patterns: WITH_EPISODE (need digit) and SERIES_ONLY (just the series name). "Die drei ??? und der Fluch des Rubins" now properly detects "Die drei ???" as series even without folge#. - New _build_search_title: removes ??? sequences, trailing parens, collapses whitespace, before sending to APIs. - Manual search also passes through normalization. Logs source + hit count per query. Debug endpoint: - GET /api/items/match/debug?title=...&author=... returns raw results from all 4 sources with status, error messages, and full metadata. - "Debug" button added in BookDetail — shows what each API actually returns inline, so the user can see if it's a search problem, parse problem, or threshold problem. - "Cover aus Datei" button — triggers local cover extraction (folder.jpg or embedded artwork) on demand. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 18:34:49 +02:00
parent 38f7c9726e
commit e3e6492b1f
5 changed files with 279 additions and 39 deletions
--- a/backend/app/routers/matching.py
+++ b/backend/app/routers/matching.py
@@ -52,6 +52,63 @@ async def search_match(
    return {"results": results}


+@router.get("/match/debug")
+async def debug_match(
+    title: str,
+    author: str | None = None,
+    current_user: User = Depends(get_current_user),
+):
+    """Debug-Endpoint: gibt rohe Ergebnisse aller Such-Quellen zurück.
+    Aufruf direkt aus Browser: /api/items/match/debug?title=Foo&author=Bar
+    """
+    from ..services.matching.musicbrainz import search_musicbrainz
+    from ..services.matching.open_library import search_open_library
+    from ..services.matching.google_books import search_google_books
+    from ..services.matching.dnb import search_dnb
+    from ..services.matcher import _build_search_title, detect_series
+
+    series, episode = detect_series(title)
+    search_title = _build_search_title(title)
+    if series and episode:
+        search_title = f"{series} {episode}"
+
+    logger.info(f"DEBUG: title={title!r} → search={search_title!r} series={series!r} episode={episode!r}")
+
+    async def _try(name, coro):
+        try:
+            r = await coro
+            return {
+                "source": name,
+                "ok": True,
+                "count": len(r),
+                "results": [
+                    {
+                        "title": x.title, "author": x.author, "narrator": x.narrator,
+                        "publisher": x.publisher, "year": x.publish_year,
+                        "series": x.series, "series_sequence": x.series_sequence,
+                        "cover_url": x.cover_url, "language": x.language,
+                        "genres": x.genres, "description": (x.description or "")[:200],
+                        "confidence": x.confidence, "source_id": x.source_id,
+                    } for x in r
+                ],
+            }
+        except Exception as e:
+            return {"source": name, "ok": False, "error": f"{type(e).__name__}: {e}"}
+
+    results = await asyncio.gather(
+        _try("musicbrainz", search_musicbrainz(search_title, author)),
+        _try("open_library", search_open_library(search_title, author)),
+        _try("google_books", search_google_books(search_title, author)),
+        _try("dnb", search_dnb(search_title, author)),
+    )
+
+    return {
+        "input": {"title": title, "author": author},
+        "normalized": {"search_title": search_title, "series": series, "episode": episode},
+        "sources": results,
+    }
+
+
@router.post("/{item_id}/match/apply")
 async def apply_match(
    item_id: str,
--- a/backend/app/services/matcher.py
+++ b/backend/app/services/matcher.py
@@ -31,7 +31,8 @@ logger = logging.getLogger(__name__)
 AUTO_ACCEPT_THRESHOLD = 0.65
 UNCERTAIN_THRESHOLD = 0.40

-SERIES_PATTERNS = [
+# Mit Folgenummer
+SERIES_PATTERNS_WITH_EPISODE = [
    (r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)\s*[-–]?\s*(?:folge\s*)?(\d+)", "Die drei ???"),
    (r"(?i)^(tkkg)\s*[-–]?\s*(?:folge\s*)?(\d+)", "TKKG"),
    (r"(?i)^(fünf freunde|funf freunde)\s*[-–]?\s*(?:band\s*)?(\d+)", "Fünf Freunde"),
@@ -43,17 +44,41 @@ SERIES_PATTERNS = [
    (r"(?i)^(.+?)\s*\((?:folge|band|teil|nr\.?|#|episode)\s*(\d+)\)", None),
 ]

+# Ohne Folgenummer (nur Serie erkennen)
+SERIES_PATTERNS_SERIES_ONLY = [
+    (r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)", "Die drei ???"),
+    (r"(?i)^(tkkg)\b", "TKKG"),
+    (r"(?i)^(fünf freunde|funf freunde)", "Fünf Freunde"),
+    (r"(?i)^(bibi blocksberg)", "Bibi Blocksberg"),
+    (r"(?i)^(benjamin blümchen|benjamin blumchen)", "Benjamin Blümchen"),
+    (r"(?i)^(bibi und tina)", "Bibi und Tina"),
+    (r"(?i)^(der kleine vampir)", "Der kleine Vampir"),
+]
+

 def detect_series(title: str) -> tuple[str | None, str | None]:
-    for pattern, canonical_name in SERIES_PATTERNS:
-        m = re.match(pattern, title.strip())
+    t = title.strip()
+    for pattern, canonical_name in SERIES_PATTERNS_WITH_EPISODE:
+        m = re.match(pattern, t)
        if m:
-            series = canonical_name or m.group(1).strip()
-            episode = m.group(2)
-            return series, episode
+            return (canonical_name or m.group(1).strip(), m.group(2))
+    for pattern, canonical_name in SERIES_PATTERNS_SERIES_ONLY:
+        m = re.match(pattern, t)
+        if m:
+            return (canonical_name or m.group(1).strip(), None)
    return None, None


+def _build_search_title(original: str) -> str:
+    """Bereinigt Titel für Such-APIs: ??? raus, Sonderzeichen, Klammer-Suffixe."""
+    t = original
+    t = re.sub(r"\?{2,}", "", t)
+    t = re.sub(r"\s*\([^)]*\)\s*$", "", t)
+    t = re.sub(r"[_\-–]+", " ", t)
+    t = re.sub(r"\s+", " ", t).strip()
+    return t
+
+
 def _title_similarity(a: str, b: str) -> float:
    """Wort-Überlapp mit Min/Max-Gewichtung — lenient für Teil-Treffer."""
    if not a or not b:
@@ -215,15 +240,20 @@ async def match_audiobook(item_id: str):
        author = item.author

        series, episode = detect_series(title)
-        search_title = title
        if series:
-            search_title = f"{series} {episode}" if episode else series
+            if episode:
+                search_title = f"{series} {episode}"
+            else:
+                # Serie erkannt, keine Folgennummer → kompletten Titel suchen
+                search_title = _build_search_title(title)
            if not item.series:
                item.series = series
            if not item.series_sequence and episode:
                item.series_sequence = episode
+        else:
+            search_title = _build_search_title(title)

-        logger.info(f"Matche: '{title}' (Such-Titel: '{search_title}') | Quellen: {sources}")
+        logger.info(f"Matche: orig='{title}' suchTitel='{search_title}' author={author!r} | Quellen: {sources}")

        best: MatchResult | None = None
        best_score = 0.0
@@ -274,18 +304,23 @@ async def match_audiobook(item_id: str):

 async def search_for_item(title: str, author: str | None = None) -> list[dict]:
    """Suche über alle Quellen – für manuelles Matching. Gibt alle relevanten Felder zurück."""
-    async def _search_source(coro):
+    search_title = _build_search_title(title)
+    logger.info(f"Manuelle Suche: orig='{title}' bereinigt='{search_title}' author={author!r}")
+
+    async def _search_source(name: str, coro):
        try:
-            return await coro
+            r = await coro
+            logger.info(f"Manuelle Suche {name}: {len(r)} Treffer")
+            return r
        except Exception as e:
-            logger.warning(f"Such-Fehler: {e}")
+            logger.warning(f"Manuelle Suche {name} Fehler: {e}")
            return []

    mb, ol, gb, dnb = await asyncio.gather(
-        _search_source(search_musicbrainz(title, author)),
-        _search_source(search_open_library(title, author)),
-        _search_source(search_google_books(title, author)),
-        _search_source(search_dnb(title, author)),
+        _search_source("musicbrainz", search_musicbrainz(search_title, author)),
+        _search_source("open_library", search_open_library(search_title, author)),
+        _search_source("google_books", search_google_books(search_title, author)),
+        _search_source("dnb", search_dnb(search_title, author)),
    )

    results = []
--- a/backend/app/services/matching/dnb.py
+++ b/backend/app/services/matching/dnb.py
@@ -1,24 +1,67 @@
 """
 Deutsche Nationalbibliothek (DNB) SRU-Schnittstelle.
-Sucht Hörbücher (mat=ton) über MARC21-XML.
+Mehrere Query-Strategien mit Fallback; ausführliches Logging.
 """
 import re
+import logging
 import httpx
 from xml.etree import ElementTree as ET
 from .base import MatchResult

+logger = logging.getLogger(__name__)
+
 DNB_SRU = "https://services.dnb.de/sru/dnb"
 HEADERS = {"User-Agent": "audiolib/1.0 (contact@audiolib.local)"}
 _NS_SRW = "http://www.loc.gov/zing/srw/"
 _NS_MARC = "http://www.loc.gov/MARC21/slim"

+# CQL Wildcards / Sonderzeichen die wir aus Such-Titeln entfernen
+_CQL_STRIP = re.compile(r"[?*<>=/\"']")
+_WHITESPACE = re.compile(r"\s+")
+
+
+def _norm_for_query(text: str) -> str:
+    """Entfernt CQL-Sonderzeichen und Doppelspaces."""
+    out = _CQL_STRIP.sub(" ", text)
+    out = _WHITESPACE.sub(" ", out).strip()
+    return out
+

 async def search_dnb(title: str, author: str | None = None) -> list[MatchResult]:
-    parts = [f'tit="{title}"', "mat=ton"]
-    if author:
-        parts.append(f'per="{author}"')
-    query = " AND ".join(parts)
+    """Mehrere Query-Strategien, gibt beim ersten Erfolg zurück."""
+    norm_title = _norm_for_query(title)
+    norm_author = _norm_for_query(author) if author else None

+    if not norm_title:
+        logger.info("DNB: leerer Titel nach Normalisierung")
+        return []
+
+    queries: list[str] = []
+    # 1) Titel + Autor (mit Hörbuch-Filter)
+    if norm_author:
+        queries.append(f'tit="{norm_title}" AND per="{norm_author}" AND mat=ton')
+        queries.append(f'tit="{norm_title}" AND per="{norm_author}"')
+    # 2) Nur Titel (mit Hörbuch-Filter)
+    queries.append(f'tit="{norm_title}" AND mat=ton')
+    # 3) Nur Titel ohne Filter
+    queries.append(f'tit="{norm_title}"')
+    # 4) Volltext-Fallback
+    if norm_author:
+        queries.append(f'{norm_title} {norm_author}')
+    else:
+        queries.append(norm_title)
+
+    async with httpx.AsyncClient(headers=HEADERS, timeout=20) as client:
+        for query in queries:
+            results = await _dnb_query(client, query)
+            if results:
+                logger.info(f"DNB: '{query}' → {len(results)} Treffer")
+                return results
+            logger.info(f"DNB: '{query}' → 0 Treffer")
+    return []
+
+
+async def _dnb_query(client: httpx.AsyncClient, query: str) -> list[MatchResult]:
    params = {
        "version": "1.1",
        "operation": "searchRetrieve",
@@ -26,18 +69,38 @@ async def search_dnb(title: str, author: str | None = None) -> list[MatchResult]
        "recordSchema": "MARC21-xml",
        "maximumRecords": "5",
    }
-    async with httpx.AsyncClient(headers=HEADERS, timeout=15) as client:
-        try:
-            r = await client.get(DNB_SRU, params=params)
-            r.raise_for_status()
-        except Exception:
-            return []
+    try:
+        r = await client.get(DNB_SRU, params=params)
+    except Exception as e:
+        logger.warning(f"DNB HTTP-Fehler ({query!r}): {e}")
+        return []
+
+    if r.status_code != 200:
+        snippet = r.text[:200] if r.text else ""
+        logger.warning(f"DNB HTTP {r.status_code} für {query!r}: {snippet}")
+        return []

    try:
        root = ET.fromstring(r.text)
-    except ET.ParseError:
+    except ET.ParseError as e:
+        logger.warning(f"DNB XML-Parse-Fehler: {e} — Body: {r.text[:200]}")
        return []

+    # numberOfRecords prüfen
+    num_elem = root.find(f".//{{{_NS_SRW}}}numberOfRecords")
+    num = 0
+    if num_elem is not None and num_elem.text:
+        try:
+            num = int(num_elem.text)
+        except ValueError:
+            pass
+
+    # Diagnose-Fehler aus DNB
+    diag = root.find(f".//{{http://www.loc.gov/zing/srw/diagnostic/}}diagnostic")
+    if diag is not None:
+        diag_msg = "".join(diag.itertext()).strip()
+        logger.warning(f"DNB Diagnose: {diag_msg}")
+
    results = []
    for record in root.findall(f".//{{{_NS_SRW}}}record"):
        marc = record.find(f".//{{{_NS_MARC}}}record")
@@ -47,8 +110,8 @@ async def search_dnb(title: str, author: str | None = None) -> list[MatchResult]
            result = _parse_marc(marc)
            if result:
                results.append(result)
-        except Exception:
-            continue
+        except Exception as e:
+            logger.warning(f"DNB MARC-Parse-Fehler: {e}")
    return results


@@ -86,23 +149,31 @@ def _parse_marc(marc) -> MatchResult | None:
    author = _field(marc, "100", "a")
    if author:
        author = author.rstrip(",").strip()
+        # DNB-Format "Nachname, Vorname" → "Vorname Nachname"
+        if "," in author:
+            parts = [p.strip() for p in author.split(",", 1)]
+            if len(parts) == 2:
+                author = f"{parts[1]} {parts[0]}"

-    # Sprecher aus 700 $e = "Sprecher" oder $4 = "spk"
    narrator = None
    for f in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='700']"):
        e_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='e']")
        r_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='4']")
        is_narrator = (
-            (e_sf is not None and e_sf.text and "prech" in e_sf.text.lower())
-            or (r_sf is not None and r_sf.text == "spk")
+            (e_sf is not None and e_sf.text and ("prech" in e_sf.text.lower() or "erzähl" in e_sf.text.lower()))
+            or (r_sf is not None and r_sf.text in ("spk", "nrt"))
        )
        if is_narrator:
            n_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='a']")
            if n_sf is not None and n_sf.text:
                narrator = n_sf.text.rstrip(",").strip()
+                if "," in narrator:
+                    parts = [p.strip() for p in narrator.split(",", 1)]
+                    if len(parts) == 2:
+                        narrator = f"{parts[1]} {parts[0]}"
                break

-    publisher = (_field(marc, "264", "b") or "").rstrip(",").strip() or None
+    publisher = (_field(marc, "264", "b") or _field(marc, "260", "b") or "").rstrip(",").strip() or None
    year_raw = _field(marc, "264", "c") or _field(marc, "260", "c")
    publish_year = None
    if year_raw:
@@ -114,14 +185,13 @@ def _parse_marc(marc) -> MatchResult | None:
    language = _field(marc, "041", "a")
    genres = _fields(marc, "650", "a")[:5]

-    series = _field(marc, "830", "a") or _field(marc, "800", "t")
-    series_seq = _field(marc, "830", "v") or _field(marc, "800", "v")
+    series = _field(marc, "830", "a") or _field(marc, "800", "t") or _field(marc, "490", "a")
+    series_seq = _field(marc, "830", "v") or _field(marc, "800", "v") or _field(marc, "490", "v")

-    # DNB-ID aus Kontrollfeld 001
    ctrl = marc.find(f"{{{_NS_MARC}}}controlfield[@tag='001']")
    dnb_id = ctrl.text.strip() if ctrl is not None and ctrl.text else None

-    # ISBN für Cover
+    # ISBN für Cover (sehr unzuverlässig bei DNB-Hörbüchern)
    isbn_raw = _field(marc, "020", "a") or ""
    isbn = re.sub(r"[^0-9X]", "", isbn_raw.split()[0]) if isbn_raw else None
    cover_url = f"https://portal.dnb.de/opac/mvb/cover?isbn={isbn}" if isbn else None