DNB rewrite: - Multiple query strategies with fallback (title+author+mat=ton → title+author → title+mat=ton → title-only → fulltext). Returns on first hit. Most German audiobooks aren't tagged mat=ton in DNB, which was killing all searches. - Strip CQL wildcard chars (?, *, <, >, =, /, quotes) from search terms. The "???" in "Die drei ???" was breaking the CQL parser. - Log HTTP status, body snippet on non-200, and numberOfRecords on every query so log shows exactly what DNB returned. - Parse SRU diagnostic elements (DNB error messages buried in XML). - Convert author/narrator from "Lastname, Firstname" to "Firstname Lastname" for consistency with other sources. Matcher: - Split series patterns: WITH_EPISODE (need digit) and SERIES_ONLY (just the series name). "Die drei ??? und der Fluch des Rubins" now properly detects "Die drei ???" as series even without folge#. - New _build_search_title: removes ??? sequences, trailing parens, collapses whitespace, before sending to APIs. - Manual search also passes through normalization. Logs source + hit count per query. Debug endpoint: - GET /api/items/match/debug?title=...&author=... returns raw results from all 4 sources with status, error messages, and full metadata. - "Debug" button added in BookDetail — shows what each API actually returns inline, so the user can see if it's a search problem, parse problem, or threshold problem. - "Cover aus Datei" button — triggers local cover extraction (folder.jpg or embedded artwork) on demand. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
216 lines
7.4 KiB
Python
216 lines
7.4 KiB
Python
"""
|
|
Deutsche Nationalbibliothek (DNB) SRU-Schnittstelle.
|
|
Mehrere Query-Strategien mit Fallback; ausführliches Logging.
|
|
"""
|
|
import re
|
|
import logging
|
|
import httpx
|
|
from xml.etree import ElementTree as ET
|
|
from .base import MatchResult
|
|
|
|
# Module-wide logger, named after this module's import path.
logger: logging.Logger = logging.getLogger(__name__)

# Endpoint every SRU request in this module is sent to.
DNB_SRU: str = "https://services.dnb.de/sru/dnb"

# Default headers attached to each outgoing request.
HEADERS: dict[str, str] = {"User-Agent": "audiolib/1.0 (contact@audiolib.local)"}

# XML namespaces used when looking up elements in the response:
# the SRU envelope and the embedded MARC21 records.
_NS_SRW: str = "http://www.loc.gov/zing/srw/"
_NS_MARC: str = "http://www.loc.gov/MARC21/slim"
|
|
|
|
# CQL wildcards / special characters that are removed from search terms.
# The backslash is included because it is CQL's escape character: a term
# ending in "\" would otherwise escape the closing quote of tit="...".
_CQL_STRIP = re.compile(r"[?*<>=/\"'\\]")
_WHITESPACE = re.compile(r"\s+")


def _norm_for_query(text: str) -> str:
    """Return *text* with CQL special characters removed.

    Each special character is replaced by a space (so neighbouring words do
    not fuse together), then runs of whitespace are collapsed to a single
    space and the result is trimmed.

    Args:
        text: Raw search term, e.g. a title or an author name.

    Returns:
        The cleaned term; an empty string if nothing searchable remains.
    """
    without_specials = _CQL_STRIP.sub(" ", text)
    return _WHITESPACE.sub(" ", without_specials).strip()
|
|
|
|
|
|
async def search_dnb(title: str, author: str | None = None) -> list[MatchResult]:
    """Search DNB with a cascade of queries and return the first non-empty hit list.

    Title and author are normalised first. The queries are then tried in
    order, from most to least specific:

    1. title + author, restricted to ``mat=ton`` (only with an author)
    2. title + author (only with an author)
    3. title, restricted to ``mat=ton``
    4. title only
    5. bare terms: the title, followed by the author when there is one

    Args:
        title: Title to look for.
        author: Optional author name used to narrow the search.

    Returns:
        The results of the first query that yields any; an empty list if
        the normalised title is empty or no query produced results.
    """
    norm_title = _norm_for_query(title)
    norm_author = _norm_for_query(author) if author else None

    if not norm_title:
        logger.info("DNB: leerer Titel nach Normalisierung")
        return []

    title_clause = f'tit="{norm_title}"'
    if norm_author:
        person_clause = f'per="{norm_author}"'
        strategies = [
            f"{title_clause} AND {person_clause} AND mat=ton",
            f"{title_clause} AND {person_clause}",
            f"{title_clause} AND mat=ton",
            title_clause,
            f"{norm_title} {norm_author}",
        ]
    else:
        strategies = [
            f"{title_clause} AND mat=ton",
            title_clause,
            norm_title,
        ]

    async with httpx.AsyncClient(headers=HEADERS, timeout=20) as client:
        for cql in strategies:
            hits = await _dnb_query(client, cql)
            if not hits:
                logger.info(f"DNB: '{cql}' → 0 Treffer")
                continue
            logger.info(f"DNB: '{cql}' → {len(hits)} Treffer")
            return hits
    return []
|
|
|
|
|
|
async def _dnb_query(client: httpx.AsyncClient, query: str) -> list[MatchResult]:
    """Send one SRU ``searchRetrieve`` request to DNB and parse the records.

    Every failure mode (transport error, non-200 status, malformed XML, a
    record that cannot be parsed) is logged and degrades to fewer or no
    results instead of raising, so the caller can move on to its next query.

    Args:
        client: Open HTTP client used to issue the GET request.
        query: Query string passed through unchanged as the SRU ``query``
            parameter.

    Returns:
        One ``MatchResult`` per MARC record that could be parsed (at most
        five records are requested); an empty list if the request failed or
        nothing usable came back.
    """
    params = {
        "version": "1.1",
        "operation": "searchRetrieve",
        "query": query,
        "recordSchema": "MARC21-xml",
        "maximumRecords": "5",
    }
    try:
        r = await client.get(DNB_SRU, params=params)
    except Exception as e:
        # Deliberately broad: a failing request must only cost this one
        # query, never abort the whole lookup.
        logger.warning(f"DNB HTTP-Fehler ({query!r}): {e}")
        return []

    if r.status_code != 200:
        snippet = r.text[:200] if r.text else ""
        logger.warning(f"DNB HTTP {r.status_code} für {query!r}: {snippet}")
        return []

    try:
        root = ET.fromstring(r.text)
    except ET.ParseError as e:
        logger.warning(f"DNB XML-Parse-Fehler: {e} — Body: {r.text[:200]}")
        return []

    # Total hit count as reported by the server; stays None when the
    # element is missing or not numeric.
    num: int | None = None
    num_elem = root.find(f".//{{{_NS_SRW}}}numberOfRecords")
    if num_elem is not None and num_elem.text:
        try:
            num = int(num_elem.text)
        except ValueError:
            logger.warning(f"DNB: numberOfRecords nicht numerisch: {num_elem.text!r}")

    # Error messages are delivered inside the XML body as SRU diagnostics.
    for diag in root.findall(".//{http://www.loc.gov/zing/srw/diagnostic/}diagnostic"):
        diag_msg = "".join(diag.itertext()).strip()
        logger.warning(f"DNB Diagnose: {diag_msg}")

    results: list[MatchResult] = []
    for record in root.findall(f".//{{{_NS_SRW}}}record"):
        marc = record.find(f".//{{{_NS_MARC}}}record")
        if marc is None:
            continue
        try:
            result = _parse_marc(marc)
        except Exception as e:
            # One broken record must not discard the remaining ones.
            logger.warning(f"DNB MARC-Parse-Fehler: {e}")
            continue
        if result:
            results.append(result)

    # Report the server-side count next to what was actually parsed, so the
    # log shows whether a miss is a search problem or a parse problem.
    reported = num if num is not None else "?"
    logger.info(
        f"DNB: {query!r} → numberOfRecords={reported}, {len(results)} Datensätze geparst"
    )
    return results
|
|
|
|
|
|
def _field(marc, tag: str, code: str | None = None) -> str | None:
    """Return the first non-empty value found in the datafields with *tag*.

    Args:
        marc: MARC record element whose direct ``datafield`` children are
            searched.
        tag: Value of the datafield's ``tag`` attribute.
        code: Subfield code to read. When omitted (or empty), the texts of
            all subfields of a datafield are joined with single spaces.

    Returns:
        The stripped text of the first matching subfield that has text (or
        the joined subfield texts), scanning datafields in document order;
        ``None`` if no datafield provides a value.
    """
    subfield_tag = f"{{{_NS_MARC}}}subfield"
    for datafield in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='{tag}']"):
        if not code:
            texts = [sub.text.strip() for sub in datafield.findall(subfield_tag) if sub.text]
            if texts:
                return " ".join(texts)
            continue
        hit = datafield.find(f"{subfield_tag}[@code='{code}']")
        if hit is not None and hit.text:
            return hit.text.strip()
    return None
|
|
|
|
|
|
def _fields(marc, tag: str, code: str) -> list[str]:
    """Collect one subfield value from every datafield with *tag*.

    For each matching datafield only the first subfield with the given
    *code* is considered; datafields without such a subfield, or whose
    subfield has no text, contribute nothing.

    Args:
        marc: MARC record element whose direct ``datafield`` children are
            searched.
        tag: Value of the datafield's ``tag`` attribute.
        code: Subfield code to read.

    Returns:
        The stripped subfield texts in document order (possibly empty).
    """
    subfield_path = f"{{{_NS_MARC}}}subfield[@code='{code}']"
    candidates = (
        datafield.find(subfield_path)
        for datafield in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='{tag}']")
    )
    return [sub.text.strip() for sub in candidates if sub is not None and sub.text]
|
|
|
|
|
|
def _display_name(raw: str) -> str:
    """Convert a "Lastname, Firstname" heading into "Firstname Lastname".

    Names without a comma are returned trimmed; a dangling trailing comma
    is dropped rather than producing a leading space.
    """
    name = raw.strip().rstrip(",").strip()
    last, comma, first = name.partition(",")
    if not comma:
        return name
    return f"{first.strip()} {last.strip()}".strip()


def _is_narrator_field(field) -> bool:
    """Tell whether a 700 datafield credits its person as speaker/narrator.

    All ``$e`` (relator term) and ``$4`` (relator code) subfields are
    inspected, because a single person may carry several roles.
    """
    terms = (
        sf.text.lower()
        for sf in field.findall(f"{{{_NS_MARC}}}subfield[@code='e']")
        if sf.text
    )
    # Substring match so that variants such as "Sprecher"/"Sprecherin" and
    # "Erzähler"/"Erzählerin" are all recognised.
    if any("prech" in term or "erzähl" in term for term in terms):
        return True
    codes = (
        sf.text.strip().lower()
        for sf in field.findall(f"{{{_NS_MARC}}}subfield[@code='4']")
        if sf.text
    )
    return any(code in ("spk", "nrt") for code in codes)


def _parse_marc(marc) -> MatchResult | None:
    """Build a ``MatchResult`` from one MARC21 record element.

    Args:
        marc: The MARC ``record`` element.

    Returns:
        The populated result, or ``None`` when the record has no title
        (field 245).
    """
    # Title: 245$a plus optional 245$b, with trailing "/" separators removed.
    main_title = (_field(marc, "245", "a") or "").rstrip("/ ").strip()
    raw_subtitle = _field(marc, "245", "b")
    subtitle = (raw_subtitle.rstrip("/ ").strip() or None) if raw_subtitle else None
    title = f"{main_title} {subtitle}".strip() if subtitle else main_title
    if not title:
        return None

    # Author: 100$a, converted from "Lastname, Firstname" to "Firstname Lastname".
    raw_author = _field(marc, "100", "a")
    author = (_display_name(raw_author) or None) if raw_author else raw_author

    # Narrator: first 700 entry with a speaker/narrator role that has a name.
    narrator = None
    for field in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='700']"):
        if not _is_narrator_field(field):
            continue
        name_sf = field.find(f"{{{_NS_MARC}}}subfield[@code='a']")
        if name_sf is not None and name_sf.text:
            narrator = _display_name(name_sf.text)
            break

    # Publisher and year: prefer field 264, fall back to 260.
    publisher = (
        (_field(marc, "264", "b") or _field(marc, "260", "b") or "").rstrip(",").strip()
        or None
    )
    publish_year = None
    year_raw = _field(marc, "264", "c") or _field(marc, "260", "c")
    if year_raw:
        # The subfield may carry extra characters; take the first 4-digit run.
        year_match = re.search(r"\d{4}", year_raw)
        if year_match:
            publish_year = int(year_match.group())

    description = _field(marc, "520", "a")
    language = _field(marc, "041", "a")
    genres = _fields(marc, "650", "a")[:5]

    # Series title and volume: first of 830, 800, 490 that provides a value.
    series = _field(marc, "830", "a") or _field(marc, "800", "t") or _field(marc, "490", "a")
    series_seq = _field(marc, "830", "v") or _field(marc, "800", "v") or _field(marc, "490", "v")

    # Control field 001 holds the record id used as source_id.
    ctrl = marc.find(f"{{{_NS_MARC}}}controlfield[@tag='001']")
    dnb_id = ctrl.text.strip() if ctrl is not None and ctrl.text else None

    # ISBN for the cover lookup (very unreliable for DNB audiobooks). Only
    # the first whitespace-separated token of 020$a is used.
    isbn_raw = _field(marc, "020", "a") or ""
    isbn = re.sub(r"[^0-9X]", "", isbn_raw.split()[0]) if isbn_raw else None
    cover_url = f"https://portal.dnb.de/opac/mvb/cover?isbn={isbn}" if isbn else None

    return MatchResult(
        source="dnb",
        # Fall back to a title-derived id when the record has no 001 field.
        source_id=dnb_id or f"dnb_{title[:30]}",
        title=title,
        subtitle=subtitle,
        author=author,
        narrator=narrator,
        description=description,
        publisher=publisher,
        publish_year=publish_year,
        language=language,
        genres=genres,
        series=series,
        series_sequence=series_seq,
        cover_url=cover_url,
        confidence=0.65,
    )
|