Make matching debuggable + fix metadata search blockers

DNB rewrite:
- Multiple query strategies with fallback (title+author+mat=ton → title+author → title+mat=ton → title-only → fulltext). Returns on the first hit. Most German audiobooks aren't tagged mat=ton in DNB, which was killing all searches.
- Strip CQL special characters (?, *, <, >, =, /, quotes) from search terms. The "???" in "Die drei ???" was breaking the CQL parser.
- Log the HTTP status, a body snippet on non-200 responses, and numberOfRecords on every query, so the log shows exactly what DNB returned.
- Parse SRU diagnostic elements (DNB error messages buried in the XML).
- Convert author/narrator from "Lastname, Firstname" to "Firstname Lastname" for consistency with the other sources.

Matcher:
- Split the series patterns into SERIES_PATTERNS_WITH_EPISODE (requires a digit) and SERIES_PATTERNS_SERIES_ONLY (just the series name). "Die drei ??? und der Fluch des Rubins" is now properly detected as the series "Die drei ???" even without an episode number.
- New _build_search_title: removes ??? sequences and a trailing parenthesized suffix, replaces underscores and dashes with spaces, and collapses whitespace before the title is sent to the APIs.
- Manual search also passes through this normalization. Logs source + hit count per query.

Debug endpoint:
- GET /api/items/match/debug?title=...&author=... returns raw results from all 4 sources with status, error messages, and full metadata.
- "Debug" button added in BookDetail — shows inline what each API actually returns, so the user can see whether it's a search problem, a parse problem, or a threshold problem.
- "Cover aus Datei" button — triggers local cover extraction (folder.jpg or embedded artwork) on demand.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 18:34:49 +02:00
parent 38f7c9726e
commit e3e6492b1f
5 changed files with 279 additions and 39 deletions
--- a/backend/app/services/matcher.py
+++ b/backend/app/services/matcher.py
@@ -31,7 +31,8 @@ logger = logging.getLogger(__name__)
 AUTO_ACCEPT_THRESHOLD = 0.65
 UNCERTAIN_THRESHOLD = 0.40

-SERIES_PATTERNS = [
+# Mit Folgenummer
+SERIES_PATTERNS_WITH_EPISODE = [
    (r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)\s*[-–]?\s*(?:folge\s*)?(\d+)", "Die drei ???"),
    (r"(?i)^(tkkg)\s*[-–]?\s*(?:folge\s*)?(\d+)", "TKKG"),
    (r"(?i)^(fünf freunde|funf freunde)\s*[-–]?\s*(?:band\s*)?(\d+)", "Fünf Freunde"),
@@ -43,17 +44,41 @@ SERIES_PATTERNS = [
    (r"(?i)^(.+?)\s*\((?:folge|band|teil|nr\.?|#|episode)\s*(\d+)\)", None),
 ]

+# Ohne Folgenummer (nur Serie erkennen)
+SERIES_PATTERNS_SERIES_ONLY = [
+    (r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)", "Die drei ???"),
+    (r"(?i)^(tkkg)\b", "TKKG"),
+    (r"(?i)^(fünf freunde|funf freunde)", "Fünf Freunde"),
+    (r"(?i)^(bibi blocksberg)", "Bibi Blocksberg"),
+    (r"(?i)^(benjamin blümchen|benjamin blumchen)", "Benjamin Blümchen"),
+    (r"(?i)^(bibi und tina)", "Bibi und Tina"),
+    (r"(?i)^(der kleine vampir)", "Der kleine Vampir"),
+]
+

 def detect_series(title: str) -> tuple[str | None, str | None]:
-    for pattern, canonical_name in SERIES_PATTERNS:
-        m = re.match(pattern, title.strip())
+    t = title.strip()
+    for pattern, canonical_name in SERIES_PATTERNS_WITH_EPISODE:
+        m = re.match(pattern, t)
        if m:
-            series = canonical_name or m.group(1).strip()
-            episode = m.group(2)
-            return series, episode
+            return (canonical_name or m.group(1).strip(), m.group(2))
+    for pattern, canonical_name in SERIES_PATTERNS_SERIES_ONLY:
+        m = re.match(pattern, t)
+        if m:
+            return (canonical_name or m.group(1).strip(), None)
    return None, None


+def _build_search_title(original: str) -> str:
+    """Bereinigt Titel für Such-APIs: ??? raus, Sonderzeichen, Klammer-Suffixe."""
+    t = original
+    t = re.sub(r"\?{2,}", "", t)
+    t = re.sub(r"\s*\([^)]*\)\s*$", "", t)
+    t = re.sub(r"[_\-–]+", " ", t)
+    t = re.sub(r"\s+", " ", t).strip()
+    return t
+
+
 def _title_similarity(a: str, b: str) -> float:
    """Wort-Überlapp mit Min/Max-Gewichtung — lenient für Teil-Treffer."""
    if not a or not b:
@@ -215,15 +240,20 @@ async def match_audiobook(item_id: str):
        author = item.author

        series, episode = detect_series(title)
-        search_title = title
        if series:
-            search_title = f"{series} {episode}" if episode else series
+            if episode:
+                search_title = f"{series} {episode}"
+            else:
+                # Serie erkannt, keine Folgennummer → kompletten Titel suchen
+                search_title = _build_search_title(title)
            if not item.series:
                item.series = series
            if not item.series_sequence and episode:
                item.series_sequence = episode
+        else:
+            search_title = _build_search_title(title)

-        logger.info(f"Matche: '{title}' (Such-Titel: '{search_title}') | Quellen: {sources}")
+        logger.info(f"Matche: orig='{title}' suchTitel='{search_title}' author={author!r} | Quellen: {sources}")

        best: MatchResult | None = None
        best_score = 0.0
@@ -274,18 +304,23 @@ async def match_audiobook(item_id: str):

 async def search_for_item(title: str, author: str | None = None) -> list[dict]:
    """Suche über alle Quellen – für manuelles Matching. Gibt alle relevanten Felder zurück."""
-    async def _search_source(coro):
+    search_title = _build_search_title(title)
+    logger.info(f"Manuelle Suche: orig='{title}' bereinigt='{search_title}' author={author!r}")
+
+    async def _search_source(name: str, coro):
        try:
-            return await coro
+            r = await coro
+            logger.info(f"Manuelle Suche {name}: {len(r)} Treffer")
+            return r
        except Exception as e:
-            logger.warning(f"Such-Fehler: {e}")
+            logger.warning(f"Manuelle Suche {name} Fehler: {e}")
            return []

    mb, ol, gb, dnb = await asyncio.gather(
-        _search_source(search_musicbrainz(title, author)),
-        _search_source(search_open_library(title, author)),
-        _search_source(search_google_books(title, author)),
-        _search_source(search_dnb(title, author)),
+        _search_source("musicbrainz", search_musicbrainz(search_title, author)),
+        _search_source("open_library", search_open_library(search_title, author)),
+        _search_source("google_books", search_google_books(search_title, author)),
+        _search_source("dnb", search_dnb(search_title, author)),
    )

    results = []