# Audiolib/backend/app/services/matching/dnb.py

"""
Deutsche Nationalbibliothek (DNB) SRU-Schnittstelle.
Mehrere Query-Strategien mit Fallback; ausführliches Logging.
"""
import re
import logging
import httpx
from xml.etree import ElementTree as ET
from .base import MatchResult

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# SRU endpoint that every query in this module is sent to.
DNB_SRU = "https://services.dnb.de/sru/dnb"
# Default headers handed to the HTTP client used for the queries.
HEADERS = {"User-Agent": "audiolib/1.0 (contact@audiolib.local)"}
# XML namespaces used to address SRW response elements and MARC elements.
_NS_SRW = "http://www.loc.gov/zing/srw/"
_NS_MARC = "http://www.loc.gov/MARC21/slim"

# CQL wildcards / special characters that are removed from search terms.
_CQL_STRIP = re.compile(r"[?*<>=/\"']")
# Runs of whitespace, collapsed to a single space during normalisation.
_WHITESPACE = re.compile(r"\s+")


def _norm_for_query(text: str) -> str:
    """Make free text safe to embed in a double-quoted CQL search term.

    Every character matched by ``_CQL_STRIP`` (wildcards, relation characters,
    quotes) and every backslash is replaced by a space; whitespace runs are
    then collapsed to a single space and the result is trimmed.

    Args:
        text: Raw title or author string.

    Returns:
        The cleaned string; may be empty if nothing searchable remains.
    """
    # The backslash is CQL's escape character: left in place, a trailing one
    # would escape the closing quote of tit="..." and invalidate the query.
    cleaned = _CQL_STRIP.sub(" ", text.replace("\\", " "))
    return _WHITESPACE.sub(" ", cleaned).strip()


async def search_dnb(title: str, author: str | None = None) -> list[MatchResult]:
    """Search the DNB with progressively looser queries.

    The queries are tried in order and the results of the first one that
    yields any hits are returned:

    1. title + author, restricted to ``mat=ton``
    2. title + author
    3. title only, restricted to ``mat=ton``
    4. title only
    5. unqualified fallback (title and, if given, author as bare terms)

    Args:
        title: Title to search for; normalised before use.
        author: Optional author name; normalised before use.

    Returns:
        The matches of the first successful query, or an empty list when the
        normalised title is empty or no query produced a hit.
    """
    norm_title = _norm_for_query(title)
    norm_author = None
    if author:
        norm_author = _norm_for_query(author)

    if not norm_title:
        logger.info("DNB: leerer Titel nach Normalisierung")
        return []

    if norm_author:
        # Author-qualified attempts come first; the fallback carries both terms.
        queries: list[str] = [
            f'tit="{norm_title}" AND per="{norm_author}" AND mat=ton',
            f'tit="{norm_title}" AND per="{norm_author}"',
        ]
        fallback = f'{norm_title} {norm_author}'
    else:
        queries = []
        fallback = norm_title
    queries += [
        f'tit="{norm_title}" AND mat=ton',
        f'tit="{norm_title}"',
        fallback,
    ]

    async with httpx.AsyncClient(headers=HEADERS, timeout=20) as client:
        for query in queries:
            results = await _dnb_query(client, query)
            if not results:
                logger.info(f"DNB: '{query}' → 0 Treffer")
                continue
            logger.info(f"DNB: '{query}' → {len(results)} Treffer")
            return results
    return []


async def _dnb_query(client: httpx.AsyncClient, query: str) -> list[MatchResult]:
    """Send one CQL query to the DNB SRU endpoint and parse the returned records.

    At most five records are requested in the ``MARC21-xml`` schema. Transport
    errors, non-200 responses, malformed XML and records that fail to parse
    are logged as warnings instead of being raised, so the caller simply sees
    fewer (or no) results and can move on to its next query.

    Args:
        client: Open HTTP client used for the request.
        query: CQL query string.

    Returns:
        One ``MatchResult`` per MARC record that could be parsed.
    """
    params = {
        "version": "1.1",
        "operation": "searchRetrieve",
        "query": query,
        "recordSchema": "MARC21-xml",
        "maximumRecords": "5",
    }
    try:
        r = await client.get(DNB_SRU, params=params)
    except Exception as e:
        # Deliberately broad: a failed lookup must never break the caller.
        logger.warning(f"DNB HTTP-Fehler ({query!r}): {e}")
        return []

    if r.status_code != 200:
        snippet = r.text[:200] if r.text else ""
        logger.warning(f"DNB HTTP {r.status_code} für {query!r}: {snippet}")
        return []

    try:
        root = ET.fromstring(r.text)
    except ET.ParseError as e:
        logger.warning(f"DNB XML-Parse-Fehler: {e} — Body: {r.text[:200]}")
        return []

    # Surface the first diagnostic (e.g. a query syntax error) reported by the
    # server; such responses contain no records, so the loop below is a no-op.
    diag = root.find(f".//{{http://www.loc.gov/zing/srw/diagnostic/}}diagnostic")
    if diag is not None:
        diag_msg = "".join(diag.itertext()).strip()
        logger.warning(f"DNB Diagnose: {diag_msg}")

    results = []
    for record in root.findall(f".//{{{_NS_SRW}}}record"):
        # Each SRW <record> wraps the actual MARC <record> element.
        marc = record.find(f".//{{{_NS_MARC}}}record")
        if marc is None:
            continue
        try:
            result = _parse_marc(marc)
            if result:
                results.append(result)
        except Exception as e:
            # One broken record must not discard the remaining hits.
            logger.warning(f"DNB MARC-Parse-Fehler: {e}")
    return results


def _field(marc, tag: str, code: str | None = None) -> str | None:
    """Return the first non-empty value found in the datafields with ``tag``.

    With ``code`` given, the first subfield carrying that code is looked up in
    each matching datafield and its stripped text is returned as soon as one
    has text. Without ``code``, the stripped texts of all subfields of the
    first datafield that has any are joined with single spaces.

    Returns:
        The extracted text, or ``None`` when no datafield yields a value.
    """
    subfield_path = f"{{{_NS_MARC}}}subfield"
    if code:
        subfield_path += f"[@code='{code}']"

    for datafield in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='{tag}']"):
        if code:
            hit = datafield.find(subfield_path)
            if hit is not None and hit.text:
                return hit.text.strip()
            continue
        texts = [node.text.strip() for node in datafield.findall(subfield_path) if node.text]
        if texts:
            return " ".join(texts)
    return None


def _fields(marc, tag: str, code: str) -> list[str]:
    """Collect one value per datafield with ``tag``.

    For every matching datafield the first subfield with ``code`` is taken;
    datafields without such a subfield, or whose subfield has no text, are
    skipped. Values are stripped and returned in document order.
    """
    datafields = marc.findall(f"{{{_NS_MARC}}}datafield[@tag='{tag}']")
    first_hits = (
        datafield.find(f"{{{_NS_MARC}}}subfield[@code='{code}']")
        for datafield in datafields
    )
    return [hit.text.strip() for hit in first_hits if hit is not None and hit.text]


# MARC relator codes ($4) and German relator-term fragments ($e) that mark a
# contributor as the speaker/narrator of a recording. "prech" covers
# "Sprecher"/"Sprecherin", "erzähl" covers "Erzähler"/"Erzählerin".
_NARRATOR_RELATOR_CODES = frozenset({"spk", "nrt"})
_NARRATOR_TERM_HINTS = ("prech", "erzähl")

# Shape of a hyphen-free ISBN-10 (check digit may be X) or ISBN-13.
_ISBN_SHAPE = re.compile(r"\d{9}[\dX]|\d{13}")


def _invert_name(raw: str) -> str | None:
    """Turn a heading of the form "Surname, Forename" into "Forename Surname".

    Surrounding commas and whitespace are dropped first; only the first comma
    splits the name. Names without a comma are returned trimmed. Returns
    ``None`` when nothing is left.
    """
    name = raw.strip().strip(",").strip()
    surname, comma, forename = name.partition(",")
    if comma:
        # Joining only the non-empty halves avoids stray leading/trailing spaces.
        name = " ".join(part for part in (forename.strip(), surname.strip()) if part)
    return name or None


def _is_narrator(datafield: ET.Element) -> bool:
    """Return True if any $e or $4 subfield of the field names a narrator role.

    Both subfields are repeatable, so every occurrence is inspected rather
    than only the first one.
    """
    for subfield in datafield.findall(f"{{{_NS_MARC}}}subfield"):
        value = (subfield.text or "").strip().lower()
        if not value:
            continue
        code = subfield.get("code")
        if code == "4" and value in _NARRATOR_RELATOR_CODES:
            return True
        if code == "e" and any(hint in value for hint in _NARRATOR_TERM_HINTS):
            return True
    return False


def _parse_marc(marc: ET.Element) -> MatchResult | None:
    """Build a ``MatchResult`` from one MARC21-xml ``<record>`` element.

    Fields read: 245 $a/$b (title, subtitle), 100 $a (author), 700 (first
    contributor with a narrator role), 264/260 $b/$c (publisher, year),
    520 $a (description), 041 $a (language), 650 $a (up to five genres),
    830/800/490 (series and sequence), 001 (record id) and 020 $a (ISBN, used
    only to build a cover URL).

    Returns:
        The populated result, or ``None`` when the record has no title.
    """
    title_a = (_field(marc, "245", "a") or "").rstrip("/ ").strip()
    title_b = (_field(marc, "245", "b") or "").rstrip("/ ").strip()
    title = f"{title_a} {title_b}".strip()
    if not title:
        return None
    subtitle = title_b or None

    # DNB headings are "Surname, Forename"; present them in natural order.
    author_raw = _field(marc, "100", "a")
    author = _invert_name(author_raw) if author_raw else None

    narrator = None
    for datafield in marc.findall(f"{{{_NS_MARC}}}datafield[@tag='700']"):
        if not _is_narrator(datafield):
            continue
        name_sf = datafield.find(f"{{{_NS_MARC}}}subfield[@code='a']")
        if name_sf is not None and name_sf.text:
            narrator = _invert_name(name_sf.text)
            if narrator:
                break

    publisher = (_field(marc, "264", "b") or _field(marc, "260", "b") or "").rstrip(",").strip() or None
    year_raw = _field(marc, "264", "c") or _field(marc, "260", "c")
    publish_year = None
    if year_raw:
        # Date statements may carry extra characters, e.g. "[2019]" or "c2019".
        year_match = re.search(r"\d{4}", year_raw)
        if year_match:
            publish_year = int(year_match.group())

    description = _field(marc, "520", "a")
    language = _field(marc, "041", "a")
    genres = _fields(marc, "650", "a")[:5]

    series = _field(marc, "830", "a") or _field(marc, "800", "t") or _field(marc, "490", "a")
    series_seq = _field(marc, "830", "v") or _field(marc, "800", "v") or _field(marc, "490", "v")

    ctrl = marc.find(f"{{{_NS_MARC}}}controlfield[@tag='001']")
    dnb_id = ctrl.text.strip() if ctrl is not None and ctrl.text else None

    # ISBN for the cover lookup (very unreliable for DNB audiobook records).
    # Only the first token counts; it is upper-cased so that a lowercase "x"
    # check digit survives, and discarded unless it has a valid ISBN shape.
    isbn = None
    isbn_tokens = (_field(marc, "020", "a") or "").split()
    if isbn_tokens:
        candidate = re.sub(r"[^0-9X]", "", isbn_tokens[0].upper())
        if _ISBN_SHAPE.fullmatch(candidate):
            isbn = candidate
    cover_url = f"https://portal.dnb.de/opac/mvb/cover?isbn={isbn}" if isbn else None

    return MatchResult(
        source="dnb",
        # Fall back to a title-derived id when the record has no control number.
        source_id=dnb_id or f"dnb_{title[:30]}",
        title=title,
        subtitle=subtitle,
        author=author,
        narrator=narrator,
        description=description,
        publisher=publisher,
        publish_year=publish_year,
        language=language,
        genres=genres,
        series=series,
        series_sequence=series_seq,
        cover_url=cover_url,
        confidence=0.65,
    )