Files
Audiolib e3e6492b1f Make matching debuggable + fix metadata search blockers
DNB rewrite:
- Multiple query strategies with fallback (title+author+mat=ton →
  title+author → title+mat=ton → title-only → fulltext). Returns on
  first hit. Most German audiobooks aren't tagged mat=ton in DNB,
  which was killing all searches.
- Strip CQL wildcard chars (?, *, <, >, =, /, quotes) from search
  terms. The "???" in "Die drei ???" was breaking the CQL parser.
- Log HTTP status, body snippet on non-200, and numberOfRecords on
  every query so log shows exactly what DNB returned.
- Parse SRU diagnostic elements (DNB error messages buried in XML).
- Convert author/narrator from "Lastname, Firstname" to
  "Firstname Lastname" for consistency with other sources.

Matcher:
- Split series patterns: WITH_EPISODE (need digit) and SERIES_ONLY
  (just the series name). "Die drei ??? und der Fluch des Rubins"
  now properly detects "Die drei ???" as series even without folge#.
- New _build_search_title: removes ??? sequences, trailing parens,
  collapses whitespace, before sending to APIs.
- Manual search also passes through normalization. Logs source +
  hit count per query.

Debug endpoint:
- GET /api/items/match/debug?title=...&author=... returns raw results
  from all 4 sources with status, error messages, and full metadata.
- "Debug" button added in BookDetail — shows what each API actually
  returns inline, so the user can see if it's a search problem,
  parse problem, or threshold problem.
- "Cover aus Datei" button — triggers local cover extraction
  (folder.jpg or embedded artwork) on demand.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 18:34:49 +02:00

216 lines
7.4 KiB
Python

"""
Deutsche Nationalbibliothek (DNB) SRU-Schnittstelle.
Mehrere Query-Strategien mit Fallback; ausführliches Logging.
"""
import re
import logging
import httpx
from xml.etree import ElementTree as ET
from .base import MatchResult
# Module-level logger; every DNB request/response detail is reported through it.
logger = logging.getLogger(__name__)
# SRU endpoint that all queries in this module are sent to.
DNB_SRU = "https://services.dnb.de/sru/dnb"
# Default headers for the HTTP client; identifies this application via User-Agent.
HEADERS = {"User-Agent": "audiolib/1.0 (contact@audiolib.local)"}
# XML namespaces used to locate elements in the SRU response envelope (SRW)
# and in the MARC 21 records embedded in it.
_NS_SRW = "http://www.loc.gov/zing/srw/"
_NS_MARC = "http://www.loc.gov/MARC21/slim"
# CQL wildcards, relation characters, quotes and the escape character that are
# removed from search terms before they are embedded in a query.
# The backslash is included because it is the CQL escape character: a term
# ending in "\" would otherwise escape the closing quote of tit="..." and
# make the whole query unparsable.
_CQL_STRIP = re.compile(r"[?*<>=/\"'\\]")
_WHITESPACE = re.compile(r"\s+")


def _norm_for_query(text: str) -> str:
    """Return *text* made safe for embedding in a CQL query.

    Every CQL special character (see ``_CQL_STRIP``) is replaced by a space,
    runs of whitespace are collapsed to a single space, and leading/trailing
    whitespace is removed. The result may be an empty string.
    """
    without_specials = _CQL_STRIP.sub(" ", text)
    return _WHITESPACE.sub(" ", without_specials).strip()
async def search_dnb(title: str, author: str | None = None) -> list[MatchResult]:
    """Search the DNB catalogue, trying progressively broader queries.

    The title and author are normalised with ``_norm_for_query`` and then
    tried in this order; the results of the first query that yields any
    record are returned:

    1. title + author, restricted to ``mat=ton``
    2. title + author
    3. title only, restricted to ``mat=ton``
    4. title only
    5. unqualified full-text query (title plus author, if given)

    Steps 1 and 2 are skipped when no author is available.

    Args:
        title: Title to search for.
        author: Optional author name used to narrow the search.

    Returns:
        The parsed results of the first successful query, or an empty list
        when the normalised title is empty or no query produced a result.
    """
    norm_title = _norm_for_query(title or "")
    norm_author = _norm_for_query(author) if author else None
    if not norm_title:
        logger.info("DNB: leerer Titel nach Normalisierung")
        return []

    queries: list[str] = []
    # 1) Title + author, first with the audio filter, then without it.
    if norm_author:
        queries.append(f'tit="{norm_title}" AND per="{norm_author}" AND mat=ton')
        queries.append(f'tit="{norm_title}" AND per="{norm_author}"')
    # 2) Title only, with the audio filter.
    queries.append(f'tit="{norm_title}" AND mat=ton')
    # 3) Title only, no filter.
    queries.append(f'tit="{norm_title}"')
    # 4) Full-text fallback.
    queries.append(f"{norm_title} {norm_author}" if norm_author else norm_title)

    async with httpx.AsyncClient(headers=HEADERS, timeout=20) as client:
        for query in queries:
            results = await _dnb_query(client, query)
            if results:
                # Same "query → count" layout as the zero-hit message below.
                logger.info("DNB: '%s' → %d Treffer", query, len(results))
                return results
            logger.info("DNB: '%s' → 0 Treffer", query)
    return []
async def _dnb_query(client: httpx.AsyncClient, query: str) -> list[MatchResult]:
    """Run one SRU ``searchRetrieve`` request and parse the returned records.

    Every failure mode (transport error, non-200 status, malformed XML,
    unparsable record) is logged and degrades to fewer or no results instead
    of raising, so the caller can simply move on to its next query.

    Args:
        client: HTTP client used to send the request.
        query: CQL query string, passed through unchanged.

    Returns:
        Parsed records (at most 5 are requested); empty on any failure.
    """
    params = {
        "version": "1.1",
        "operation": "searchRetrieve",
        "query": query,
        "recordSchema": "MARC21-xml",
        "maximumRecords": "5",
    }
    try:
        r = await client.get(DNB_SRU, params=params)
    except Exception as e:
        # Deliberately broad: a lookup must never crash the caller.
        logger.warning(f"DNB HTTP-Fehler ({query!r}): {e}")
        return []

    if r.status_code != 200:
        snippet = r.text[:200] if r.text else ""
        logger.warning(f"DNB HTTP {r.status_code} für {query!r}: {snippet}")
        return []

    try:
        root = ET.fromstring(r.text)
    except ET.ParseError as e:
        logger.warning(f"DNB XML-Parse-Fehler: {e} — Body: {r.text[:200]}")
        return []

    # Total hit count reported by the server. It can differ from the number
    # of records parsed below, so it is logged separately.
    num_elem = root.find(f".//{{{_NS_SRW}}}numberOfRecords")
    num = 0
    if num_elem is not None and num_elem.text:
        try:
            num = int(num_elem.text)
        except ValueError:
            pass
    logger.info("DNB numberOfRecords=%d für %r", num, query)

    # Error details from the server arrive as an SRU diagnostic element.
    diag = root.find(f".//{{http://www.loc.gov/zing/srw/diagnostic/}}diagnostic")
    if diag is not None:
        # Join the text fragments with spaces so they do not run together.
        fragments = (text.strip() for text in diag.itertext())
        diag_msg = " ".join(fragment for fragment in fragments if fragment)
        logger.warning(f"DNB Diagnose: {diag_msg}")

    results = []
    for record in root.findall(f".//{{{_NS_SRW}}}record"):
        marc = record.find(f".//{{{_NS_MARC}}}record")
        if marc is None:
            continue
        try:
            result = _parse_marc(marc)
            if result:
                results.append(result)
        except Exception as e:
            # One broken record must not discard the remaining ones.
            logger.warning(f"DNB MARC-Parse-Fehler: {e}")
    return results
def _field(marc, tag: str, code: str | None = None) -> str | None:
    """Return the first usable value of MARC datafield *tag*, or ``None``.

    With *code*, the first subfield carrying that code is looked up in each
    matching datafield and its stripped text is returned for the first one
    that has text. Without *code*, the stripped texts of all subfields of
    the first datafield that has any text are joined with single spaces.
    """
    datafields = marc.findall(f"{{{_NS_MARC}}}datafield[@tag='{tag}']")

    if code:
        for datafield in datafields:
            hit = datafield.find(f"{{{_NS_MARC}}}subfield[@code='{code}']")
            if hit is None or not hit.text:
                continue
            return hit.text.strip()
        return None

    for datafield in datafields:
        texts = []
        for subfield in datafield.findall(f"{{{_NS_MARC}}}subfield"):
            if subfield.text:
                texts.append(subfield.text.strip())
        if texts:
            return " ".join(texts)
    return None
def _fields(marc, tag: str, code: str) -> list[str]:
    """Collect subfield *code* from every MARC datafield *tag*.

    For each matching datafield only the first subfield with that code is
    considered; datafields lacking it, or where it has no text, are skipped.
    Values are stripped and returned in document order.
    """
    datafield_path = f"{{{_NS_MARC}}}datafield[@tag='{tag}']"
    subfield_path = f"{{{_NS_MARC}}}subfield[@code='{code}']"
    first_matches = (
        datafield.find(subfield_path) for datafield in marc.findall(datafield_path)
    )
    return [
        subfield.text.strip()
        for subfield in first_matches
        if subfield is not None and subfield.text
    ]
def _flip_name(raw: str) -> str:
    """Convert "Lastname, Firstname" to "Firstname Lastname"; otherwise just trim."""
    name = raw.rstrip(",").strip()
    last, comma, first = name.partition(",")
    if not comma:
        return name
    return f"{first.strip()} {last.strip()}".strip()


def _has_narrator_role(datafield) -> bool:
    """Tell whether any $e / $4 subfield of a 700 datafield marks a narrator."""
    # $e and $4 are repeatable, so every occurrence is inspected rather than
    # only the first one of each.
    for subfield in datafield.findall(f"{{{_NS_MARC}}}subfield"):
        value = (subfield.text or "").strip().lower()
        role_code = subfield.get("code")
        # Free-text role, e.g. "Sprecher" / "Erzähler".
        if role_code == "e" and ("prech" in value or "erzähl" in value):
            return True
        # Relator code: speaker / narrator.
        if role_code == "4" and value in ("spk", "nrt"):
            return True
    return False


def _parse_marc(marc) -> MatchResult | None:
    """Build a ``MatchResult`` from one MARC 21 record element.

    Fields read: 245 $a/$b (title, subtitle), 100 $a (author), 700 (narrator),
    264/260 $b/$c (publisher, year), 520 $a (description), 041 $a (language),
    650 $a (up to five genres), 830/800/490 (series and sequence),
    controlfield 001 (DNB id) and 020 $a (ISBN, used for the cover URL).

    Args:
        marc: The MARC ``record`` element.

    Returns:
        The populated result with a fixed confidence of 0.65, or ``None``
        when the record has no title.
    """
    title_a = (_field(marc, "245", "a") or "").rstrip("/ ").strip()
    title_b = _field(marc, "245", "b")
    # A $b holding only separators counts as "no subtitle".
    subtitle = (title_b.rstrip("/ ").strip() or None) if title_b else None
    title = f"{title_a} {subtitle}".strip() if subtitle else title_a
    if not title:
        return None

    # DNB stores names as "Lastname, Firstname"; other sources use
    # "Firstname Lastname".
    author = _field(marc, "100", "a")
    if author:
        author = _flip_name(author) or None

    # First 700 entry that is flagged as narrator and carries a name.
    narrator = None
    for datafield in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='700']"):
        if not _has_narrator_role(datafield):
            continue
        name_sf = datafield.find(f"{{{_NS_MARC}}}subfield[@code='a']")
        if name_sf is not None and name_sf.text:
            narrator = _flip_name(name_sf.text) or None
            if narrator:
                break

    publisher = (
        (_field(marc, "264", "b") or _field(marc, "260", "b") or "").rstrip(",").strip()
        or None
    )

    # The date subfield is free text; take the first four-digit run as the year.
    year_raw = _field(marc, "264", "c") or _field(marc, "260", "c")
    publish_year = None
    if year_raw:
        year_match = re.search(r"\d{4}", year_raw)
        if year_match:
            publish_year = int(year_match.group())

    description = _field(marc, "520", "a")
    language = _field(marc, "041", "a")
    genres = _fields(marc, "650", "a")[:5]
    series = _field(marc, "830", "a") or _field(marc, "800", "t") or _field(marc, "490", "a")
    series_seq = _field(marc, "830", "v") or _field(marc, "800", "v") or _field(marc, "490", "v")

    ctrl = marc.find(f"{{{_NS_MARC}}}controlfield[@tag='001']")
    dnb_id = ctrl.text.strip() if ctrl is not None and ctrl.text else None

    # ISBN for the cover lookup (very unreliable for DNB audiobooks). Only the
    # first token is used; upper-casing keeps a lower-case "x" check character.
    isbn_raw = _field(marc, "020", "a") or ""
    isbn = re.sub(r"[^0-9X]", "", isbn_raw.split()[0].upper()) if isbn_raw else None
    cover_url = f"https://portal.dnb.de/opac/mvb/cover?isbn={isbn}" if isbn else None

    return MatchResult(
        source="dnb",
        # Without a control number, fall back to an id derived from the title.
        source_id=dnb_id or f"dnb_{title[:30]}",
        title=title,
        subtitle=subtitle,
        author=author,
        narrator=narrator,
        description=description,
        publisher=publisher,
        publish_year=publish_year,
        language=language,
        genres=genres,
        series=series,
        series_sequence=series_seq,
        cover_url=cover_url,
        confidence=0.65,
    )