diff --git a/backend/app/routers/matching.py b/backend/app/routers/matching.py index cceec42..a81cc24 100644 --- a/backend/app/routers/matching.py +++ b/backend/app/routers/matching.py @@ -52,6 +52,63 @@ async def search_match( return {"results": results} +@router.get("/match/debug") +async def debug_match( + title: str, + author: str | None = None, + current_user: User = Depends(get_current_user), +): + """Debug-Endpoint: gibt rohe Ergebnisse aller Such-Quellen zurück. + Aufruf direkt aus Browser: /api/items/match/debug?title=Foo&author=Bar + """ + from ..services.matching.musicbrainz import search_musicbrainz + from ..services.matching.open_library import search_open_library + from ..services.matching.google_books import search_google_books + from ..services.matching.dnb import search_dnb + from ..services.matcher import _build_search_title, detect_series + + series, episode = detect_series(title) + search_title = _build_search_title(title) + if series and episode: + search_title = f"{series} {episode}" + + logger.info(f"DEBUG: title={title!r} → search={search_title!r} series={series!r} episode={episode!r}") + + async def _try(name, coro): + try: + r = await coro + return { + "source": name, + "ok": True, + "count": len(r), + "results": [ + { + "title": x.title, "author": x.author, "narrator": x.narrator, + "publisher": x.publisher, "year": x.publish_year, + "series": x.series, "series_sequence": x.series_sequence, + "cover_url": x.cover_url, "language": x.language, + "genres": x.genres, "description": (x.description or "")[:200], + "confidence": x.confidence, "source_id": x.source_id, + } for x in r + ], + } + except Exception as e: + return {"source": name, "ok": False, "error": f"{type(e).__name__}: {e}"} + + results = await asyncio.gather( + _try("musicbrainz", search_musicbrainz(search_title, author)), + _try("open_library", search_open_library(search_title, author)), + _try("google_books", search_google_books(search_title, author)), + _try("dnb", search_dnb(search_title, author)), + ) + + return { + "input": {"title": title, "author": author}, + "normalized": {"search_title": search_title, "series": series, "episode": episode}, + "sources": results, + } + + @router.post("/{item_id}/match/apply") async def apply_match( item_id: str, diff --git a/backend/app/services/matcher.py b/backend/app/services/matcher.py index cf87413..81711e7 100644 --- a/backend/app/services/matcher.py +++ b/backend/app/services/matcher.py @@ -31,7 +31,8 @@ logger = logging.getLogger(__name__) AUTO_ACCEPT_THRESHOLD = 0.65 UNCERTAIN_THRESHOLD = 0.40 -SERIES_PATTERNS = [ +# Mit Folgenummer +SERIES_PATTERNS_WITH_EPISODE = [ (r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)\s*[-–]?\s*(?:folge\s*)?(\d+)", "Die drei ???"), (r"(?i)^(tkkg)\s*[-–]?\s*(?:folge\s*)?(\d+)", "TKKG"), (r"(?i)^(fünf freunde|funf freunde)\s*[-–]?\s*(?:band\s*)?(\d+)", "Fünf Freunde"), @@ -43,17 +44,41 @@ SERIES_PATTERNS = [ (r"(?i)^(.+?)\s*\((?:folge|band|teil|nr\.?|#|episode)\s*(\d+)\)", None), ] +# Ohne Folgenummer (nur Serie erkennen) +SERIES_PATTERNS_SERIES_ONLY = [ + (r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)", "Die drei ???"), + (r"(?i)^(tkkg)\b", "TKKG"), + (r"(?i)^(fünf freunde|funf freunde)", "Fünf Freunde"), + (r"(?i)^(bibi blocksberg)", "Bibi Blocksberg"), + (r"(?i)^(benjamin blümchen|benjamin blumchen)", "Benjamin Blümchen"), + (r"(?i)^(bibi und tina)", "Bibi und Tina"), + (r"(?i)^(der kleine vampir)", "Der kleine Vampir"), +] + def detect_series(title: str) -> tuple[str | None, str | None]: - for pattern, canonical_name in SERIES_PATTERNS: - m = re.match(pattern, title.strip()) + t = title.strip() + for pattern, canonical_name in SERIES_PATTERNS_WITH_EPISODE: + m = re.match(pattern, t) if m: - series = canonical_name or m.group(1).strip() - episode = m.group(2) - return series, episode + return (canonical_name or m.group(1).strip(), m.group(2)) + for pattern, canonical_name in SERIES_PATTERNS_SERIES_ONLY: + m = re.match(pattern, t) + if m: + return (canonical_name or m.group(1).strip(), None) return None, None +def _build_search_title(original: str) -> str: + """Bereinigt Titel für Such-APIs: ??? raus, Sonderzeichen, Klammer-Suffixe.""" + t = original + t = re.sub(r"\?{2,}", "", t) + t = re.sub(r"\s*\([^)]*\)\s*$", "", t) + t = re.sub(r"[_\-–]+", " ", t) + t = re.sub(r"\s+", " ", t).strip() + return t + + def _title_similarity(a: str, b: str) -> float: """Wort-Überlapp mit Min/Max-Gewichtung — lenient für Teil-Treffer.""" if not a or not b: @@ -215,15 +240,20 @@ async def match_audiobook(item_id: str): author = item.author series, episode = detect_series(title) - search_title = title if series: - search_title = f"{series} {episode}" if episode else series + if episode: + search_title = f"{series} {episode}" + else: + # Serie erkannt, keine Folgennummer → kompletten Titel suchen + search_title = _build_search_title(title) if not item.series: item.series = series if not item.series_sequence and episode: item.series_sequence = episode + else: + search_title = _build_search_title(title) - logger.info(f"Matche: '{title}' (Such-Titel: '{search_title}') | Quellen: {sources}") + logger.info(f"Matche: orig='{title}' suchTitel='{search_title}' author={author!r} | Quellen: {sources}") best: MatchResult | None = None best_score = 0.0 @@ -274,18 +304,23 @@ async def match_audiobook(item_id: str): async def search_for_item(title: str, author: str | None = None) -> list[dict]: """Suche über alle Quellen – für manuelles Matching. Gibt alle relevanten Felder zurück.""" - async def _search_source(coro): + search_title = _build_search_title(title) + logger.info(f"Manuelle Suche: orig='{title}' bereinigt='{search_title}' author={author!r}") + + async def _search_source(name: str, coro): try: - return await coro + r = await coro + logger.info(f"Manuelle Suche {name}: {len(r)} Treffer") + return r except Exception as e: - logger.warning(f"Such-Fehler: {e}") + logger.warning(f"Manuelle Suche {name} Fehler: {e}") return [] mb, ol, gb, dnb = await asyncio.gather( - _search_source(search_musicbrainz(title, author)), - _search_source(search_open_library(title, author)), - _search_source(search_google_books(title, author)), - _search_source(search_dnb(title, author)), + _search_source("musicbrainz", search_musicbrainz(search_title, author)), + _search_source("open_library", search_open_library(search_title, author)), + _search_source("google_books", search_google_books(search_title, author)), + _search_source("dnb", search_dnb(search_title, author)), ) results = [] diff --git a/backend/app/services/matching/dnb.py b/backend/app/services/matching/dnb.py index 3c6f402..a2eba7a 100644 --- a/backend/app/services/matching/dnb.py +++ b/backend/app/services/matching/dnb.py @@ -1,24 +1,67 @@ """ Deutsche Nationalbibliothek (DNB) SRU-Schnittstelle. -Sucht Hörbücher (mat=ton) über MARC21-XML. +Mehrere Query-Strategien mit Fallback; ausführliches Logging. """ import re +import logging import httpx from xml.etree import ElementTree as ET from .base import MatchResult +logger = logging.getLogger(__name__) + DNB_SRU = "https://services.dnb.de/sru/dnb" HEADERS = {"User-Agent": "audiolib/1.0 (contact@audiolib.local)"} _NS_SRW = "http://www.loc.gov/zing/srw/" _NS_MARC = "http://www.loc.gov/MARC21/slim" +# CQL Wildcards / Sonderzeichen die wir aus Such-Titeln entfernen +_CQL_STRIP = re.compile(r"[?*<>=/\"']") +_WHITESPACE = re.compile(r"\s+") + + +def _norm_for_query(text: str) -> str: + """Entfernt CQL-Sonderzeichen und Doppelspaces.""" + out = _CQL_STRIP.sub(" ", text) + out = _WHITESPACE.sub(" ", out).strip() + return out + async def search_dnb(title: str, author: str | None = None) -> list[MatchResult]: - parts = [f'tit="{title}"', "mat=ton"] - if author: - parts.append(f'per="{author}"') - query = " AND ".join(parts) + """Mehrere Query-Strategien, gibt beim ersten Erfolg zurück.""" + norm_title = _norm_for_query(title) + norm_author = _norm_for_query(author) if author else None + if not norm_title: + logger.info("DNB: leerer Titel nach Normalisierung") + return [] + + queries: list[str] = [] + # 1) Titel + Autor (mit Hörbuch-Filter) + if norm_author: + queries.append(f'tit="{norm_title}" AND per="{norm_author}" AND mat=ton') + queries.append(f'tit="{norm_title}" AND per="{norm_author}"') + # 2) Nur Titel (mit Hörbuch-Filter) + queries.append(f'tit="{norm_title}" AND mat=ton') + # 3) Nur Titel ohne Filter + queries.append(f'tit="{norm_title}"') + # 4) Volltext-Fallback + if norm_author: + queries.append(f'{norm_title} {norm_author}') + else: + queries.append(norm_title) + + async with httpx.AsyncClient(headers=HEADERS, timeout=20) as client: + for query in queries: + results = await _dnb_query(client, query) + if results: + logger.info(f"DNB: '{query}' → {len(results)} Treffer") + return results + logger.info(f"DNB: '{query}' → 0 Treffer") + return [] + + +async def _dnb_query(client: httpx.AsyncClient, query: str) -> list[MatchResult]: params = { "version": "1.1", "operation": "searchRetrieve", @@ -26,18 +69,38 @@ async def search_dnb(title: str, author: str | None = None) -> list[MatchResult] "recordSchema": "MARC21-xml", "maximumRecords": "5", } - async with httpx.AsyncClient(headers=HEADERS, timeout=15) as client: - try: - r = await client.get(DNB_SRU, params=params) - r.raise_for_status() - except Exception: - return [] + try: + r = await client.get(DNB_SRU, params=params) + except Exception as e: + logger.warning(f"DNB HTTP-Fehler ({query!r}): {e}") + return [] + + if r.status_code != 200: + snippet = r.text[:200] if r.text else "" + logger.warning(f"DNB HTTP {r.status_code} für {query!r}: {snippet}") + return [] try: root = ET.fromstring(r.text) - except ET.ParseError: + except ET.ParseError as e: + logger.warning(f"DNB XML-Parse-Fehler: {e} — Body: {r.text[:200]}") return [] + # numberOfRecords prüfen + num_elem = root.find(f".//{{{_NS_SRW}}}numberOfRecords") + num = 0 + if num_elem is not None and num_elem.text: + try: + num = int(num_elem.text) + except ValueError: + pass + + # Diagnose-Fehler aus DNB + diag = root.find(f".//{{http://www.loc.gov/zing/srw/diagnostic/}}diagnostic") + if diag is not None: + diag_msg = "".join(diag.itertext()).strip() + logger.warning(f"DNB Diagnose: {diag_msg}") + results = [] for record in root.findall(f".//{{{_NS_SRW}}}record"): marc = record.find(f".//{{{_NS_MARC}}}record") @@ -47,8 +110,8 @@ async def search_dnb(title: str, author: str | None = None) -> list[MatchResult] result = _parse_marc(marc) if result: results.append(result) - except Exception: - continue + except Exception as e: + logger.warning(f"DNB MARC-Parse-Fehler: {e}") return results @@ -86,23 +149,31 @@ def _parse_marc(marc) -> MatchResult | None: author = _field(marc, "100", "a") if author: author = author.rstrip(",").strip() + # DNB-Format "Nachname, Vorname" → "Vorname Nachname" + if "," in author: + parts = [p.strip() for p in author.split(",", 1)] + if len(parts) == 2: + author = f"{parts[1]} {parts[0]}" - # Sprecher aus 700 $e = "Sprecher" oder $4 = "spk" narrator = None for f in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='700']"): e_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='e']") r_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='4']") is_narrator = ( - (e_sf is not None and e_sf.text and "prech" in e_sf.text.lower()) - or (r_sf is not None and r_sf.text == "spk") + (e_sf is not None and e_sf.text and ("prech" in e_sf.text.lower() or "erzähl" in e_sf.text.lower())) + or (r_sf is not None and r_sf.text in ("spk", "nrt")) ) if is_narrator: n_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='a']") if n_sf is not None and n_sf.text: narrator = n_sf.text.rstrip(",").strip() + if "," in narrator: + parts = [p.strip() for p in narrator.split(",", 1)] + if len(parts) == 2: + narrator = f"{parts[1]} {parts[0]}" break - publisher = (_field(marc, "264", "b") or "").rstrip(",").strip() or None + publisher = (_field(marc, "264", "b") or _field(marc, "260", "b") or "").rstrip(",").strip() or None year_raw = _field(marc, "264", "c") or _field(marc, "260", "c") publish_year = None if year_raw: @@ -114,14 +185,13 @@ def _parse_marc(marc) -> MatchResult | None: language = _field(marc, "041", "a") genres = _fields(marc, "650", "a")[:5] - series = _field(marc, "830", "a") or _field(marc, "800", "t") - series_seq = _field(marc, "830", "v") or _field(marc, "800", "v") + series = _field(marc, "830", "a") or _field(marc, "800", "t") or _field(marc, "490", "a") + series_seq = _field(marc, "830", "v") or _field(marc, "800", "v") or _field(marc, "490", "v") - # DNB-ID aus Kontrollfeld 001 ctrl = marc.find(f"{{{_NS_MARC}}}controlfield[@tag='001']") dnb_id = ctrl.text.strip() if ctrl is not None and ctrl.text else None - # ISBN für Cover + # ISBN für Cover (sehr unzuverlässig bei DNB-Hörbüchern) isbn_raw = _field(marc, "020", "a") or "" isbn = re.sub(r"[^0-9X]", "", isbn_raw.split()[0]) if isbn_raw else None cover_url = f"https://portal.dnb.de/opac/mvb/cover?isbn={isbn}" if isbn else None diff --git a/frontend/src/api/items.ts b/frontend/src/api/items.ts index 6a2be15..0613b12 100644 --- a/frontend/src/api/items.ts +++ b/frontend/src/api/items.ts @@ -27,4 +27,10 @@ export const applyMatch = (id: string, match: object) => export const triggerMatch = (id: string) => api.post(`/api/items/${id}/match`).then((r) => r.data) +export const debugMatch = (title: string, author?: string) => + api.get(`/api/items/match/debug`, { params: { title, author } }).then((r) => r.data) + +export const extractCover = (id: string) => + api.post(`/api/items/${id}/extract-cover`).then((r) => r.data) + export const coverUrl = (id: string) => `/api/items/${id}/cover` diff --git a/frontend/src/pages/BookDetail.tsx b/frontend/src/pages/BookDetail.tsx index ee256a0..e94c7bc 100644 --- a/frontend/src/pages/BookDetail.tsx +++ b/frontend/src/pages/BookDetail.tsx @@ -4,7 +4,7 @@ import { Play, ArrowLeft, RefreshCw, Search, Check, Loader2, Trash2, X } from 'lucide-react' -import { getItem, updateItem, triggerMatch, searchMatch, applyMatch, coverUrl } from '../api/items' +import { getItem, updateItem, triggerMatch, searchMatch, applyMatch, debugMatch, extractCover, coverUrl } from '../api/items' import { getMe, createBookmark, deleteBookmark } from '../api/me' import { usePlayerStore } from '../store/playerStore' import CoverImage from '../components/common/CoverImage' @@ -21,6 +21,8 @@ export default function BookDetail() { const [matchQuery, setMatchQuery] = useState('') const [matchLoading, setMatchLoading] = useState(false) const [showMatchPanel, setShowMatchPanel] = useState(false) + const [debugData, setDebugData] = useState(null) + const [debugLoading, setDebugLoading] = useState(false) const { play, item: currentItem, currentTime } = usePlayerStore() useEffect(() => { @@ -79,6 +81,27 @@ export default function BookDetail() { }, 3000) } + const handleDebug = async () => { + setDebugLoading(true) + try { + const data = await debugMatch(title, author || undefined) + setDebugData(data) + } finally { + setDebugLoading(false) + } + } + + const handleExtractCover = async () => { + if (!id) return + const res = await extractCover(id) + if (res.success) { + const updated = await getItem(id) + setItem(updated) + } else { + alert('Kein lokales Cover gefunden') + } + } + const fmtTime = (s: number) => { const h = Math.floor(s / 3600) const m = Math.floor((s % 3600) / 60) @@ -167,10 +190,59 @@ export default function BookDetail() { Auto-Match + + + {debugData && ( +
+
+

API-Debug

+ +
+

+ Such-Titel (bereinigt): {debugData.normalized?.search_title} + {debugData.normalized?.series && <> · Serie: {debugData.normalized.series}} + {debugData.normalized?.episode && <> · Folge: {debugData.normalized.episode}} +

+ {debugData.sources?.map((s: any) => ( +
+

+ {s.source} — {s.ok ? `${s.count} Treffer` : Fehler: {s.error}} +

+ {s.ok && s.results?.slice(0, 3).map((r: any, i: number) => ( +
+

{r.title} {r.author && `— ${r.author}`} {r.year && `(${r.year})`}

+ {r.narrator &&

Sprecher: {r.narrator}

} + {r.publisher &&

Verlag: {r.publisher}

} + {r.series &&

Serie: {r.series} {r.series_sequence && `#${r.series_sequence}`}

} + {r.cover_url &&

Cover: {r.cover_url}

} +

conf={r.confidence} id={r.source_id}

+
+ ))} +
+ ))} +
+ )} + {/* Description */} {meta.description && (