""" Deutsche Nationalbibliothek (DNB) SRU-Schnittstelle. Mehrere Query-Strategien mit Fallback; ausführliches Logging. """ import re import logging import httpx from xml.etree import ElementTree as ET from .base import MatchResult logger = logging.getLogger(__name__) DNB_SRU = "https://services.dnb.de/sru/dnb" HEADERS = {"User-Agent": "audiolib/1.0 (contact@audiolib.local)"} _NS_SRW = "http://www.loc.gov/zing/srw/" _NS_MARC = "http://www.loc.gov/MARC21/slim" # CQL Wildcards / Sonderzeichen die wir aus Such-Titeln entfernen _CQL_STRIP = re.compile(r"[?*<>=/\"']") _WHITESPACE = re.compile(r"\s+") def _norm_for_query(text: str) -> str: """Entfernt CQL-Sonderzeichen und Doppelspaces.""" out = _CQL_STRIP.sub(" ", text) out = _WHITESPACE.sub(" ", out).strip() return out async def search_dnb(title: str, author: str | None = None) -> list[MatchResult]: """Mehrere Query-Strategien, gibt beim ersten Erfolg zurück.""" norm_title = _norm_for_query(title) norm_author = _norm_for_query(author) if author else None if not norm_title: logger.info("DNB: leerer Titel nach Normalisierung") return [] queries: list[str] = [] # 1) Titel + Autor (mit Hörbuch-Filter) if norm_author: queries.append(f'tit="{norm_title}" AND per="{norm_author}" AND mat=ton') queries.append(f'tit="{norm_title}" AND per="{norm_author}"') # 2) Nur Titel (mit Hörbuch-Filter) queries.append(f'tit="{norm_title}" AND mat=ton') # 3) Nur Titel ohne Filter queries.append(f'tit="{norm_title}"') # 4) Volltext-Fallback if norm_author: queries.append(f'{norm_title} {norm_author}') else: queries.append(norm_title) async with httpx.AsyncClient(headers=HEADERS, timeout=20) as client: for query in queries: results = await _dnb_query(client, query) if results: logger.info(f"DNB: '{query}' → {len(results)} Treffer") return results logger.info(f"DNB: '{query}' → 0 Treffer") return [] async def _dnb_query(client: httpx.AsyncClient, query: str) -> list[MatchResult]: params = { "version": "1.1", "operation": "searchRetrieve", "query": query, "recordSchema": "MARC21-xml", "maximumRecords": "5", } try: r = await client.get(DNB_SRU, params=params) except Exception as e: logger.warning(f"DNB HTTP-Fehler ({query!r}): {e}") return [] if r.status_code != 200: snippet = r.text[:200] if r.text else "" logger.warning(f"DNB HTTP {r.status_code} für {query!r}: {snippet}") return [] try: root = ET.fromstring(r.text) except ET.ParseError as e: logger.warning(f"DNB XML-Parse-Fehler: {e} — Body: {r.text[:200]}") return [] # numberOfRecords prüfen num_elem = root.find(f".//{{{_NS_SRW}}}numberOfRecords") num = 0 if num_elem is not None and num_elem.text: try: num = int(num_elem.text) except ValueError: pass # Diagnose-Fehler aus DNB diag = root.find(f".//{{http://www.loc.gov/zing/srw/diagnostic/}}diagnostic") if diag is not None: diag_msg = "".join(diag.itertext()).strip() logger.warning(f"DNB Diagnose: {diag_msg}") results = [] for record in root.findall(f".//{{{_NS_SRW}}}record"): marc = record.find(f".//{{{_NS_MARC}}}record") if marc is None: continue try: result = _parse_marc(marc) if result: results.append(result) except Exception as e: logger.warning(f"DNB MARC-Parse-Fehler: {e}") return results def _field(marc, tag: str, code: str | None = None) -> str | None: for f in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='{tag}']"): if code: sf = f.find(f"{{{_NS_MARC}}}subfield[@code='{code}']") if sf is not None and sf.text: return sf.text.strip() else: parts = [sf.text.strip() for sf in f.findall(f"{{{_NS_MARC}}}subfield") if sf.text] if parts: return " ".join(parts) return None def _fields(marc, tag: str, code: str) -> list[str]: out = [] for f in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='{tag}']"): sf = f.find(f"{{{_NS_MARC}}}subfield[@code='{code}']") if sf is not None and sf.text: out.append(sf.text.strip()) return out def _parse_marc(marc) -> MatchResult | None: title_a = (_field(marc, "245", "a") or "").rstrip("/ ").strip() title_b = _field(marc, "245", "b") title = (title_a + " " + title_b.rstrip("/ ").strip()).strip() if title_b else title_a if not title: return None subtitle = title_b.rstrip("/ ").strip() if title_b else None author = _field(marc, "100", "a") if author: author = author.rstrip(",").strip() # DNB-Format "Nachname, Vorname" → "Vorname Nachname" if "," in author: parts = [p.strip() for p in author.split(",", 1)] if len(parts) == 2: author = f"{parts[1]} {parts[0]}" narrator = None for f in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='700']"): e_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='e']") r_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='4']") is_narrator = ( (e_sf is not None and e_sf.text and ("prech" in e_sf.text.lower() or "erzähl" in e_sf.text.lower())) or (r_sf is not None and r_sf.text in ("spk", "nrt")) ) if is_narrator: n_sf = f.find(f"{{{_NS_MARC}}}subfield[@code='a']") if n_sf is not None and n_sf.text: narrator = n_sf.text.rstrip(",").strip() if "," in narrator: parts = [p.strip() for p in narrator.split(",", 1)] if len(parts) == 2: narrator = f"{parts[1]} {parts[0]}" break publisher = (_field(marc, "264", "b") or _field(marc, "260", "b") or "").rstrip(",").strip() or None year_raw = _field(marc, "264", "c") or _field(marc, "260", "c") publish_year = None if year_raw: m = re.search(r"\d{4}", year_raw) if m: publish_year = int(m.group()) description = _field(marc, "520", "a") language = _field(marc, "041", "a") genres = _fields(marc, "650", "a")[:5] series = _field(marc, "830", "a") or _field(marc, "800", "t") or _field(marc, "490", "a") series_seq = _field(marc, "830", "v") or _field(marc, "800", "v") or _field(marc, "490", "v") ctrl = marc.find(f"{{{_NS_MARC}}}controlfield[@tag='001']") dnb_id = ctrl.text.strip() if ctrl is not None and ctrl.text else None # ISBN für Cover (sehr unzuverlässig bei DNB-Hörbüchern) isbn_raw = _field(marc, "020", "a") or "" isbn = re.sub(r"[^0-9X]", "", isbn_raw.split()[0]) if isbn_raw else None cover_url = f"https://portal.dnb.de/opac/mvb/cover?isbn={isbn}" if isbn else None return MatchResult( source="dnb", source_id=dnb_id or f"dnb_{title[:30]}", title=title, subtitle=subtitle, author=author, narrator=narrator, description=description, publisher=publisher, publish_year=publish_year, language=language, genres=genres, series=series, series_sequence=series_seq, cover_url=cover_url, confidence=0.65, )