Make matching debuggable + fix metadata search blockers

DNB rewrite:
- Multiple query strategies with fallback (title+author+mat=ton →
  title+author → title+mat=ton → title-only → fulltext). Returns on
  first hit. Most German audiobooks aren't tagged mat=ton in DNB,
  which was killing all searches.
- Strip CQL special characters (?, *, <, >, =, /, quotes) from search
  terms. The "???" in "Die drei ???" was breaking the CQL parser.
- Log the HTTP status, a body snippet on non-200 responses, and
  numberOfRecords on every query so the log shows exactly what DNB returned.
- Parse SRU diagnostic elements (DNB error messages buried in XML).
- Convert author/narrator from "Lastname, Firstname" to
  "Firstname Lastname" for consistency with other sources.

Matcher:
- Split series patterns into WITH_EPISODE (requires an episode number)
  and SERIES_ONLY (just the series name). "Die drei ??? und der Fluch
  des Rubins" is now detected as series "Die drei ???" even without an
  episode number.
- New _build_search_title: removes ??? sequences and a trailing
  parenthesized suffix, replaces underscores/hyphens with spaces, and
  collapses whitespace before the title is sent to the APIs.
- Manual search also passes through normalization. Logs source +
  hit count per query.

Debug endpoint:
- GET /api/items/match/debug?title=...&author=... returns raw results
  from all 4 sources with status, error messages, and full metadata.
- "Debug" button added in BookDetail — shows what each API actually
  returns inline, so the user can see if it's a search problem,
  parse problem, or threshold problem.
- "Cover aus Datei" button — triggers local cover extraction
  (folder.jpg or embedded artwork) on demand.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Audiolib
2026-05-26 18:34:49 +02:00
parent 38f7c9726e
commit e3e6492b1f
5 changed files with 279 additions and 39 deletions

View File

@@ -31,7 +31,8 @@ logger = logging.getLogger(__name__)
AUTO_ACCEPT_THRESHOLD = 0.65
UNCERTAIN_THRESHOLD = 0.40
SERIES_PATTERNS = [
# Mit Folgenummer
SERIES_PATTERNS_WITH_EPISODE = [
(r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)\s*[-]?\s*(?:folge\s*)?(\d+)", "Die drei ???"),
(r"(?i)^(tkkg)\s*[-]?\s*(?:folge\s*)?(\d+)", "TKKG"),
(r"(?i)^(fünf freunde|funf freunde)\s*[-]?\s*(?:band\s*)?(\d+)", "Fünf Freunde"),
@@ -43,17 +44,41 @@ SERIES_PATTERNS = [
(r"(?i)^(.+?)\s*\((?:folge|band|teil|nr\.?|#|episode)\s*(\d+)\)", None),
]
# Without an episode number (series detection only).
# Each entry is (regex anchored at the start of the title, canonical series name).
SERIES_PATTERNS_SERIES_ONLY = [
    (
        r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)",
        "Die drei ???",
    ),
    (r"(?i)^(tkkg)\b", "TKKG"),
    (r"(?i)^(fünf freunde|funf freunde)", "Fünf Freunde"),
    (r"(?i)^(bibi blocksberg)", "Bibi Blocksberg"),
    (r"(?i)^(benjamin blümchen|benjamin blumchen)", "Benjamin Blümchen"),
    (r"(?i)^(bibi und tina)", "Bibi und Tina"),
    (r"(?i)^(der kleine vampir)", "Der kleine Vampir"),
]
# Detect a known series name (and, when present, an episode number) at the
# start of a title. Returns (series, episode), (series, None) when only the
# series name matched, or (None, None) when nothing matched.
# NOTE(review): this span comes from a rendered diff without +/- markers and
# without indentation; the loop over SERIES_PATTERNS, the inline
# title.strip() match and the separate series/episode assignments appear to be
# the pre-change lines — confirm against the repository before relying on it.
def detect_series(title: str) -> tuple[str | None, str | None]:
for pattern, canonical_name in SERIES_PATTERNS:
m = re.match(pattern, title.strip())
t = title.strip()
# First pass: patterns that also capture an episode number (group 2).
for pattern, canonical_name in SERIES_PATTERNS_WITH_EPISODE:
m = re.match(pattern, t)
if m:
series = canonical_name or m.group(1).strip()
episode = m.group(2)
return series, episode
# Fall back to the matched text when the pattern has no canonical name.
return (canonical_name or m.group(1).strip(), m.group(2))
# Second pass: series-only patterns; no episode number is reported.
for pattern, canonical_name in SERIES_PATTERNS_SERIES_ONLY:
m = re.match(pattern, t)
if m:
return (canonical_name or m.group(1).strip(), None)
return None, None
def _build_search_title(original: str) -> str:
    """Clean a title before it is sent to the search APIs.

    Applied in order: runs of two or more question marks are dropped, one
    trailing parenthesised suffix is removed, underscores and hyphens become
    spaces, and whitespace is collapsed and trimmed.
    """
    # Order matters: the parenthesis rule is anchored at the end of the
    # string, and whitespace is collapsed last to absorb gaps left behind.
    substitutions = (
        (r"\?{2,}", ""),
        (r"\s*\([^)]*\)\s*$", ""),
        (r"[_\-]+", " "),
        (r"\s+", " "),
    )
    cleaned = original
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def _title_similarity(a: str, b: str) -> float:
"""Wort-Überlapp mit Min/Max-Gewichtung — lenient für Teil-Treffer."""
if not a or not b:
@@ -215,15 +240,20 @@ async def match_audiobook(item_id: str):
author = item.author
series, episode = detect_series(title)
search_title = title
if series:
search_title = f"{series} {episode}" if episode else series
if episode:
search_title = f"{series} {episode}"
else:
# Serie erkannt, keine Folgennummer → kompletten Titel suchen
search_title = _build_search_title(title)
if not item.series:
item.series = series
if not item.series_sequence and episode:
item.series_sequence = episode
else:
search_title = _build_search_title(title)
logger.info(f"Matche: '{title}' (Such-Titel: '{search_title}') | Quellen: {sources}")
logger.info(f"Matche: orig='{title}' suchTitel='{search_title}' author={author!r} | Quellen: {sources}")
best: MatchResult | None = None
best_score = 0.0
@@ -274,18 +304,23 @@ async def match_audiobook(item_id: str):
async def search_for_item(title: str, author: str | None = None) -> list[dict]:
"""Suche über alle Quellen für manuelles Matching. Gibt alle relevanten Felder zurück."""
async def _search_source(coro):
search_title = _build_search_title(title)
logger.info(f"Manuelle Suche: orig='{title}' bereinigt='{search_title}' author={author!r}")
async def _search_source(name: str, coro):
try:
return await coro
r = await coro
logger.info(f"Manuelle Suche {name}: {len(r)} Treffer")
return r
except Exception as e:
logger.warning(f"Such-Fehler: {e}")
logger.warning(f"Manuelle Suche {name} Fehler: {e}")
return []
mb, ol, gb, dnb = await asyncio.gather(
_search_source(search_musicbrainz(title, author)),
_search_source(search_open_library(title, author)),
_search_source(search_google_books(title, author)),
_search_source(search_dnb(title, author)),
_search_source("musicbrainz", search_musicbrainz(search_title, author)),
_search_source("open_library", search_open_library(search_title, author)),
_search_source("google_books", search_google_books(search_title, author)),
_search_source("dnb", search_dnb(search_title, author)),
)
results = []