Make matching debuggable + fix metadata search blockers
DNB rewrite: - Multiple query strategies with fallback (title+author+mat=ton → title+author → title+mat=ton → title-only → fulltext). Returns on first hit. Most German audiobooks aren't tagged mat=ton in DNB, which was killing all searches. - Strip CQL wildcard chars (?, *, <, >, =, /, quotes) from search terms. The "???" in "Die drei ???" was breaking the CQL parser. - Log HTTP status, body snippet on non-200, and numberOfRecords on every query so log shows exactly what DNB returned. - Parse SRU diagnostic elements (DNB error messages buried in XML). - Convert author/narrator from "Lastname, Firstname" to "Firstname Lastname" for consistency with other sources. Matcher: - Split series patterns: WITH_EPISODE (need digit) and SERIES_ONLY (just the series name). "Die drei ??? und der Fluch des Rubins" now properly detects "Die drei ???" as series even without folge#. - New _build_search_title: removes ??? sequences, trailing parens, collapses whitespace, before sending to APIs. - Manual search also passes through normalization. Logs source + hit count per query. Debug endpoint: - GET /api/items/match/debug?title=...&author=... returns raw results from all 4 sources with status, error messages, and full metadata. - "Debug" button added in BookDetail — shows what each API actually returns inline, so the user can see if it's a search problem, parse problem, or threshold problem. - "Cover aus Datei" button — triggers local cover extraction (folder.jpg or embedded artwork) on demand. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -31,7 +31,8 @@ logger = logging.getLogger(__name__)
|
||||
AUTO_ACCEPT_THRESHOLD = 0.65
|
||||
UNCERTAIN_THRESHOLD = 0.40
|
||||
|
||||
SERIES_PATTERNS = [
|
||||
# Patterns that also capture an episode number
|
||||
SERIES_PATTERNS_WITH_EPISODE = [
|
||||
(r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)\s*[-–]?\s*(?:folge\s*)?(\d+)", "Die drei ???"),
|
||||
(r"(?i)^(tkkg)\s*[-–]?\s*(?:folge\s*)?(\d+)", "TKKG"),
|
||||
(r"(?i)^(fünf freunde|funf freunde)\s*[-–]?\s*(?:band\s*)?(\d+)", "Fünf Freunde"),
|
||||
@@ -43,17 +44,41 @@ SERIES_PATTERNS = [
|
||||
(r"(?i)^(.+?)\s*\((?:folge|band|teil|nr\.?|#|episode)\s*(\d+)\)", None),
|
||||
]
|
||||
|
||||
# Series-only patterns: recognise a series from the start of the title when no
# episode number is present. Each entry is (regex, canonical series name).
#
# Every pattern ends with (?![^\W_]), i.e. "not followed by a letter or digit",
# so the series name must end where the match ends. Without it a longer word
# such as "Der kleine Vampirjäger" would be reported as "Der kleine Vampir".
# A plain \b cannot be used for all entries because it never matches between
# "???" and a following space (both are non-word characters). Underscores are
# deliberately allowed after the name because titles may come from file names.
SERIES_PATTERNS_SERIES_ONLY = [
    (r"(?i)^(die drei \?\?\?|die drei fragezeichen|drei fragezeichen)(?![^\W_])", "Die drei ???"),
    (r"(?i)^(tkkg)(?![^\W_])", "TKKG"),
    (r"(?i)^(fünf freunde|funf freunde)(?![^\W_])", "Fünf Freunde"),
    (r"(?i)^(bibi blocksberg)(?![^\W_])", "Bibi Blocksberg"),
    (r"(?i)^(benjamin blümchen|benjamin blumchen)(?![^\W_])", "Benjamin Blümchen"),
    (r"(?i)^(bibi und tina)(?![^\W_])", "Bibi und Tina"),
    (r"(?i)^(der kleine vampir)(?![^\W_])", "Der kleine Vampir"),
]
|
||||
|
||||
|
||||
def detect_series(title: str) -> tuple[str | None, str | None]:
    """Detect a known series, and an episode number if present, in a title.

    The title is stripped of surrounding whitespace and matched first against
    SERIES_PATTERNS_WITH_EPISODE, then against SERIES_PATTERNS_SERIES_ONLY.
    The first matching pattern wins.

    Returns:
        A ``(series, episode)`` tuple. ``series`` is the pattern's canonical
        name, or the stripped text of regex group 1 when the pattern has no
        canonical name. ``episode`` is regex group 2 for episode patterns and
        ``None`` for series-only patterns. ``(None, None)`` if nothing matches.
    """
    candidate = title.strip()

    # Each pass pairs a pattern table with whether group 2 holds an episode.
    passes = (
        (SERIES_PATTERNS_WITH_EPISODE, True),
        (SERIES_PATTERNS_SERIES_ONLY, False),
    )
    for table, has_episode in passes:
        for regex, canonical in table:
            hit = re.match(regex, candidate)
            if hit is None:
                continue
            series_name = canonical or hit.group(1).strip()
            return series_name, (hit.group(2) if has_episode else None)

    return None, None
|
||||
|
||||
|
||||
def _build_search_title(original: str) -> str:
    """Clean a title before it is sent to the search APIs.

    Steps, in order:
      1. Remove runs of two or more question marks (e.g. the "???" in
         "Die drei ???").
      2. Remove all parenthesised groups at the end of the title, including
         stacked ones such as "(Ungekürzt) (Hörbuch)".
      3. Replace underscores, hyphens and en dashes with spaces.
      4. Collapse whitespace and strip both ends.

    If cleaning leaves nothing (e.g. the title is only "???" or only a
    parenthesised suffix), the whitespace-normalised original is returned
    instead, so a non-empty title never becomes an empty search query.

    Args:
        original: The raw title.

    Returns:
        The cleaned title; empty only if ``original`` is empty or whitespace.
    """
    cleaned = re.sub(r"\?{2,}", "", original)
    # "+" on the group strips every trailing "(...)" suffix, not only the last.
    cleaned = re.sub(r"(?:\s*\([^)]*\))+\s*$", "", cleaned)
    cleaned = re.sub(r"[_\-–]+", " ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    if cleaned:
        return cleaned
    # Cleaning removed everything: fall back rather than search for "".
    return re.sub(r"\s+", " ", original).strip()
|
||||
|
||||
|
||||
def _title_similarity(a: str, b: str) -> float:
|
||||
"""Wort-Überlapp mit Min/Max-Gewichtung — lenient für Teil-Treffer."""
|
||||
if not a or not b:
|
||||
@@ -215,15 +240,20 @@ async def match_audiobook(item_id: str):
|
||||
author = item.author
|
||||
|
||||
series, episode = detect_series(title)
|
||||
search_title = title
|
||||
if series:
|
||||
search_title = f"{series} {episode}" if episode else series
|
||||
if episode:
|
||||
search_title = f"{series} {episode}"
|
||||
else:
|
||||
# Serie erkannt, keine Folgennummer → kompletten Titel suchen
|
||||
search_title = _build_search_title(title)
|
||||
if not item.series:
|
||||
item.series = series
|
||||
if not item.series_sequence and episode:
|
||||
item.series_sequence = episode
|
||||
else:
|
||||
search_title = _build_search_title(title)
|
||||
|
||||
logger.info(f"Matche: '{title}' (Such-Titel: '{search_title}') | Quellen: {sources}")
|
||||
logger.info(f"Matche: orig='{title}' suchTitel='{search_title}' author={author!r} | Quellen: {sources}")
|
||||
|
||||
best: MatchResult | None = None
|
||||
best_score = 0.0
|
||||
@@ -274,18 +304,23 @@ async def match_audiobook(item_id: str):
|
||||
|
||||
async def search_for_item(title: str, author: str | None = None) -> list[dict]:
|
||||
"""Suche über alle Quellen – für manuelles Matching. Gibt alle relevanten Felder zurück."""
|
||||
async def _search_source(coro):
|
||||
search_title = _build_search_title(title)
|
||||
logger.info(f"Manuelle Suche: orig='{title}' bereinigt='{search_title}' author={author!r}")
|
||||
|
||||
async def _search_source(name: str, coro):
|
||||
try:
|
||||
return await coro
|
||||
r = await coro
|
||||
logger.info(f"Manuelle Suche {name}: {len(r)} Treffer")
|
||||
return r
|
||||
except Exception as e:
|
||||
logger.warning(f"Such-Fehler: {e}")
|
||||
logger.warning(f"Manuelle Suche {name} Fehler: {e}")
|
||||
return []
|
||||
|
||||
mb, ol, gb, dnb = await asyncio.gather(
|
||||
_search_source(search_musicbrainz(title, author)),
|
||||
_search_source(search_open_library(title, author)),
|
||||
_search_source(search_google_books(title, author)),
|
||||
_search_source(search_dnb(title, author)),
|
||||
_search_source("musicbrainz", search_musicbrainz(search_title, author)),
|
||||
_search_source("open_library", search_open_library(search_title, author)),
|
||||
_search_source("google_books", search_google_books(search_title, author)),
|
||||
_search_source("dnb", search_dnb(search_title, author)),
|
||||
)
|
||||
|
||||
results = []
|
||||
|
||||
Reference in New Issue
Block a user