Make matching debuggable + fix metadata search blockers

DNB rewrite:
- Multiple query strategies with fallback (title+author+mat=ton →
  title+author → title+mat=ton → title-only → fulltext). Returns on
  first hit. Most German audiobooks aren't tagged mat=ton in DNB,
  which was killing all searches.
- Strip CQL special characters (wildcards ? and *, relation operators
  <, >, =, plus / and quotes) from search terms. The "???" in
  "Die drei ???" was breaking the CQL parser.
- Log HTTP status, body snippet on non-200, and numberOfRecords on
  every query so log shows exactly what DNB returned.
- Parse SRU diagnostic elements (DNB error messages buried in XML).
- Convert author/narrator from "Lastname, Firstname" to
  "Firstname Lastname" for consistency with other sources.

Matcher:
- Split series patterns: WITH_EPISODE (need digit) and SERIES_ONLY
  (just the series name). "Die drei ??? und der Fluch des Rubins"
  now properly detects "Die drei ???" as series even without folge#.
- New _build_search_title: removes ??? sequences and a trailing
  parenthesised suffix, replaces underscores/hyphens with spaces, and
  collapses whitespace before the title is sent to the APIs.
- Manual search also passes through normalization. Logs source +
  hit count per query.

Debug endpoint:
- GET /api/items/match/debug?title=...&author=... returns raw results
  from all 4 sources with status, error messages, and full metadata.
- "Debug" button added in BookDetail — shows what each API actually
  returns inline, so the user can see if it's a search problem,
  parse problem, or threshold problem.
- "Cover aus Datei" button — triggers local cover extraction
  (folder.jpg or embedded artwork) on demand.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Audiolib
2026-05-26 18:34:49 +02:00
parent 38f7c9726e
commit e3e6492b1f
5 changed files with 279 additions and 39 deletions

View File

@@ -52,6 +52,63 @@ async def search_match(
return {"results": results}
@router.get("/match/debug")
async def debug_match(
    title: str,
    author: str | None = None,
    current_user: User = Depends(get_current_user),
):
    """Debug endpoint: return the raw results of every search source.

    Can be called directly from a browser:
    /api/items/match/debug?title=Foo&author=Bar

    The response echoes the input, the normalized search title together
    with the detected series/episode, and one report per source. A source
    that raises is reported with ``ok: False`` and the error text instead
    of failing the whole request.
    """
    from ..services.matching.musicbrainz import search_musicbrainz
    from ..services.matching.open_library import search_open_library
    from ..services.matching.google_books import search_google_books
    from ..services.matching.dnb import search_dnb
    from ..services.matcher import _build_search_title, detect_series

    series, episode = detect_series(title)
    cleaned_title = _build_search_title(title)
    # A recognised series with an episode number replaces the cleaned title.
    search_title = f"{series} {episode}" if series and episode else cleaned_title
    logger.info(f"DEBUG: title={title!r} → search={search_title!r} series={series!r} episode={episode!r}")

    def _describe(hit):
        """Flatten one match result into a plain dict."""
        return {
            "title": hit.title,
            "author": hit.author,
            "narrator": hit.narrator,
            "publisher": hit.publisher,
            "year": hit.publish_year,
            "series": hit.series,
            "series_sequence": hit.series_sequence,
            "cover_url": hit.cover_url,
            "language": hit.language,
            "genres": hit.genres,
            # Only the first 200 characters of the description are returned.
            "description": (hit.description or "")[:200],
            "confidence": hit.confidence,
            "source_id": hit.source_id,
        }

    async def _try(name, coro):
        """Await one source search and report failures instead of raising."""
        try:
            hits = await coro
            report = {
                "source": name,
                "ok": True,
                "count": len(hits),
                "results": [_describe(hit) for hit in hits],
            }
        except Exception as exc:
            report = {"source": name, "ok": False, "error": f"{type(exc).__name__}: {exc}"}
        return report

    searches = (
        ("musicbrainz", search_musicbrainz),
        ("open_library", search_open_library),
        ("google_books", search_google_books),
        ("dnb", search_dnb),
    )
    # All sources are queried concurrently with the same title/author pair.
    results = await asyncio.gather(
        *(_try(name, search(search_title, author)) for name, search in searches)
    )

    return {
        "input": {"title": title, "author": author},
        "normalized": {"search_title": search_title, "series": series, "episode": episode},
        "sources": results,
    }
@router.post("/{item_id}/match/apply")
async def apply_match(
item_id: str,

View File

@@ -31,7 +31,8 @@ logger = logging.getLogger(__name__)
AUTO_ACCEPT_THRESHOLD = 0.65
UNCERTAIN_THRESHOLD = 0.40
SERIES_PATTERNS = [
# Mit Folgenummer
SERIES_PATTERNS_WITH_EPISODE = [
(r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)\s*[-]?\s*(?:folge\s*)?(\d+)", "Die drei ???"),
(r"(?i)^(tkkg)\s*[-]?\s*(?:folge\s*)?(\d+)", "TKKG"),
(r"(?i)^(fünf freunde|funf freunde)\s*[-]?\s*(?:band\s*)?(\d+)", "Fünf Freunde"),
@@ -43,17 +44,41 @@ SERIES_PATTERNS = [
(r"(?i)^(.+?)\s*\((?:folge|band|teil|nr\.?|#|episode)\s*(\d+)\)", None),
]
# Without episode number (detect the series only).
# Each entry is (regex matched against the start of the title, canonical
# series name). detect_series consults this table only after none of the
# episode-number patterns matched.
SERIES_PATTERNS_SERIES_ONLY = [
    (r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)", "Die drei ???"),
    (r"(?i)^(tkkg)\b", "TKKG"),
    (r"(?i)^(fünf freunde|funf freunde)", "Fünf Freunde"),
    (r"(?i)^(bibi blocksberg)", "Bibi Blocksberg"),
    (r"(?i)^(benjamin blümchen|benjamin blumchen)", "Benjamin Blümchen"),
    (r"(?i)^(bibi und tina)", "Bibi und Tina"),
    (r"(?i)^(der kleine vampir)", "Der kleine Vampir"),
]
def detect_series(title: str) -> tuple[str | None, str | None]:
    """Detect a known series, and an episode number if present, in a title.

    Args:
        title: Raw title; surrounding whitespace is ignored.

    Returns:
        ``(series, episode)``. ``episode`` is ``None`` when only the series
        name was recognised; both are ``None`` when nothing matched.
    """
    t = title.strip()

    # First pass: patterns that capture an episode number (group 2).
    for pattern, canonical_name in SERIES_PATTERNS_WITH_EPISODE:
        m = re.match(pattern, t)
        if m:
            # Patterns without a canonical name use the captured series text.
            return (canonical_name or m.group(1).strip(), m.group(2))

    # Second pass: the series name alone, without an episode number.
    for pattern, canonical_name in SERIES_PATTERNS_SERIES_ONLY:
        m = re.match(pattern, t)
        if m:
            return (canonical_name or m.group(1).strip(), None)

    return None, None
def _build_search_title(original: str) -> str:
"""Bereinigt Titel für Such-APIs: ??? raus, Sonderzeichen, Klammer-Suffixe."""
t = original
t = re.sub(r"\?{2,}", "", t)
t = re.sub(r"\s*\([^)]*\)\s*$", "", t)
t = re.sub(r"[_\-]+", " ", t)
t = re.sub(r"\s+", " ", t).strip()
return t
def _title_similarity(a: str, b: str) -> float:
"""Wort-Überlapp mit Min/Max-Gewichtung — lenient für Teil-Treffer."""
if not a or not b:
@@ -215,15 +240,20 @@ async def match_audiobook(item_id: str):
author = item.author
series, episode = detect_series(title)
search_title = title
if series:
search_title = f"{series} {episode}" if episode else series
if episode:
search_title = f"{series} {episode}"
else:
# Serie erkannt, keine Folgennummer → kompletten Titel suchen
search_title = _build_search_title(title)
if not item.series:
item.series = series
if not item.series_sequence and episode:
item.series_sequence = episode
else:
search_title = _build_search_title(title)
logger.info(f"Matche: '{title}' (Such-Titel: '{search_title}') | Quellen: {sources}")
logger.info(f"Matche: orig='{title}' suchTitel='{search_title}' author={author!r} | Quellen: {sources}")
best: MatchResult | None = None
best_score = 0.0
@@ -274,18 +304,23 @@ async def match_audiobook(item_id: str):
async def search_for_item(title: str, author: str | None = None) -> list[dict]:
"""Suche über alle Quellen für manuelles Matching. Gibt alle relevanten Felder zurück."""
async def _search_source(coro):
search_title = _build_search_title(title)
logger.info(f"Manuelle Suche: orig='{title}' bereinigt='{search_title}' author={author!r}")
async def _search_source(name: str, coro):
try:
return await coro
r = await coro
logger.info(f"Manuelle Suche {name}: {len(r)} Treffer")
return r
except Exception as e:
logger.warning(f"Such-Fehler: {e}")
logger.warning(f"Manuelle Suche {name} Fehler: {e}")
return []
mb, ol, gb, dnb = await asyncio.gather(
_search_source(search_musicbrainz(title, author)),
_search_source(search_open_library(title, author)),
_search_source(search_google_books(title, author)),
_search_source(search_dnb(title, author)),
_search_source("musicbrainz", search_musicbrainz(search_title, author)),
_search_source("open_library", search_open_library(search_title, author)),
_search_source("google_books", search_google_books(search_title, author)),
_search_source("dnb", search_dnb(search_title, author)),
)
results = []

View File

@@ -1,24 +1,67 @@
"""
Deutsche Nationalbibliothek (DNB) SRU-Schnittstelle.
Sucht Hörbücher (mat=ton) über MARC21-XML.
Mehrere Query-Strategien mit Fallback; ausführliches Logging.
"""
import re
import logging
import httpx
from xml.etree import ElementTree as ET
from .base import MatchResult
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# URL that receives the SRU GET requests issued by this module.
DNB_SRU = "https://services.dnb.de/sru/dnb"
# Default headers handed to the HTTP client for every request.
HEADERS = {"User-Agent": "audiolib/1.0 (contact@audiolib.local)"}
# XML namespace used to locate SRU response elements (numberOfRecords, record).
_NS_SRW = "http://www.loc.gov/zing/srw/"
# XML namespace used to locate MARC21 elements (record, datafield, subfield, controlfield).
_NS_MARC = "http://www.loc.gov/MARC21/slim"
# CQL Wildcards / Sonderzeichen die wir aus Such-Titeln entfernen
_CQL_STRIP = re.compile(r"[?*<>=/\"']")
_WHITESPACE = re.compile(r"\s+")
def _norm_for_query(text: str) -> str:
"""Entfernt CQL-Sonderzeichen und Doppelspaces."""
out = _CQL_STRIP.sub(" ", text)
out = _WHITESPACE.sub(" ", out).strip()
return out
async def search_dnb(title: str, author: str | None = None) -> list[MatchResult]:
    """Search DNB with several query strategies, returning on the first hit.

    The title and author are normalized with ``_norm_for_query`` first.
    Queries are then tried from strictest to loosest; the first one that
    yields results wins.

    Args:
        title: Title to search for.
        author: Optional author; author-based queries are skipped when it
            is missing or empty after normalization.

    Returns:
        The results of the first query with at least one hit, or an empty
        list when the normalized title is empty or no query matched.
    """
    norm_title = _norm_for_query(title)
    norm_author = _norm_for_query(author) if author else None

    if not norm_title:
        logger.info("DNB: leerer Titel nach Normalisierung")
        return []

    queries: list[str] = []
    # 1) Title + author, first with the audiobook filter, then without.
    if norm_author:
        queries.append(f'tit="{norm_title}" AND per="{norm_author}" AND mat=ton')
        queries.append(f'tit="{norm_title}" AND per="{norm_author}"')
    # 2) Title only, with the audiobook filter.
    queries.append(f'tit="{norm_title}" AND mat=ton')
    # 3) Title only, without the filter.
    queries.append(f'tit="{norm_title}"')
    # 4) Full-text fallback.
    if norm_author:
        queries.append(f'{norm_title} {norm_author}')
    else:
        queries.append(norm_title)

    # One client is shared by all attempts.
    async with httpx.AsyncClient(headers=HEADERS, timeout=20) as client:
        for query in queries:
            results = await _dnb_query(client, query)
            if results:
                logger.info(f"DNB: '{query}' → {len(results)} Treffer")
                return results
            logger.info(f"DNB: '{query}' → 0 Treffer")
    return []
async def _dnb_query(client: httpx.AsyncClient, query: str) -> list[MatchResult]:
params = {
"version": "1.1",
"operation": "searchRetrieve",
@@ -26,18 +69,38 @@ async def search_dnb(title: str, author: str | None = None) -> list[MatchResult]
"recordSchema": "MARC21-xml",
"maximumRecords": "5",
}
async with httpx.AsyncClient(headers=HEADERS, timeout=15) as client:
try:
r = await client.get(DNB_SRU, params=params)
r.raise_for_status()
except Exception:
return []
try:
r = await client.get(DNB_SRU, params=params)
except Exception as e:
logger.warning(f"DNB HTTP-Fehler ({query!r}): {e}")
return []
if r.status_code != 200:
snippet = r.text[:200] if r.text else ""
logger.warning(f"DNB HTTP {r.status_code} für {query!r}: {snippet}")
return []
try:
root = ET.fromstring(r.text)
except ET.ParseError:
except ET.ParseError as e:
logger.warning(f"DNB XML-Parse-Fehler: {e} — Body: {r.text[:200]}")
return []
# numberOfRecords prüfen
num_elem = root.find(f".//{{{_NS_SRW}}}numberOfRecords")
num = 0
if num_elem is not None and num_elem.text:
try:
num = int(num_elem.text)
except ValueError:
pass
# Diagnose-Fehler aus DNB
diag = root.find(f".//{{http://www.loc.gov/zing/srw/diagnostic/}}diagnostic")
if diag is not None:
diag_msg = "".join(diag.itertext()).strip()
logger.warning(f"DNB Diagnose: {diag_msg}")
results = []
for record in root.findall(f".//{{{_NS_SRW}}}record"):
marc = record.find(f".//{{{_NS_MARC}}}record")
@@ -47,8 +110,8 @@ async def search_dnb(title: str, author: str | None = None) -> list[MatchResult]
result = _parse_marc(marc)
if result:
results.append(result)
except Exception:
continue
except Exception as e:
logger.warning(f"DNB MARC-Parse-Fehler: {e}")
return results
@@ -86,23 +149,31 @@ def _parse_marc(marc) -> MatchResult | None:
author = _field(marc, "100", "a")
if author:
author = author.rstrip(",").strip()
# DNB-Format "Nachname, Vorname" → "Vorname Nachname"
if "," in author:
parts = [p.strip() for p in author.split(",", 1)]
if len(parts) == 2:
author = f"{parts[1]} {parts[0]}"
# Sprecher aus 700 $e = "Sprecher" oder $4 = "spk"
narrator = None
for f in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='700']"):
e_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='e']")
r_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='4']")
is_narrator = (
(e_sf is not None and e_sf.text and "prech" in e_sf.text.lower())
or (r_sf is not None and r_sf.text == "spk")
(e_sf is not None and e_sf.text and ("prech" in e_sf.text.lower() or "erzähl" in e_sf.text.lower()))
or (r_sf is not None and r_sf.text in ("spk", "nrt"))
)
if is_narrator:
n_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='a']")
if n_sf is not None and n_sf.text:
narrator = n_sf.text.rstrip(",").strip()
if "," in narrator:
parts = [p.strip() for p in narrator.split(",", 1)]
if len(parts) == 2:
narrator = f"{parts[1]} {parts[0]}"
break
publisher = (_field(marc, "264", "b") or "").rstrip(",").strip() or None
publisher = (_field(marc, "264", "b") or _field(marc, "260", "b") or "").rstrip(",").strip() or None
year_raw = _field(marc, "264", "c") or _field(marc, "260", "c")
publish_year = None
if year_raw:
@@ -114,14 +185,13 @@ def _parse_marc(marc) -> MatchResult | None:
language = _field(marc, "041", "a")
genres = _fields(marc, "650", "a")[:5]
series = _field(marc, "830", "a") or _field(marc, "800", "t")
series_seq = _field(marc, "830", "v") or _field(marc, "800", "v")
series = _field(marc, "830", "a") or _field(marc, "800", "t") or _field(marc, "490", "a")
series_seq = _field(marc, "830", "v") or _field(marc, "800", "v") or _field(marc, "490", "v")
# DNB-ID aus Kontrollfeld 001
ctrl = marc.find(f"{{{_NS_MARC}}}controlfield[@tag='001']")
dnb_id = ctrl.text.strip() if ctrl is not None and ctrl.text else None
# ISBN für Cover
# ISBN für Cover (sehr unzuverlässig bei DNB-Hörbüchern)
isbn_raw = _field(marc, "020", "a") or ""
isbn = re.sub(r"[^0-9X]", "", isbn_raw.split()[0]) if isbn_raw else None
cover_url = f"https://portal.dnb.de/opac/mvb/cover?isbn={isbn}" if isbn else None