Files
Audiolib/backend/app/routers/matching.py
Audiolib 0824894a7f Read ID3 tags during scan — fixes 'Folge 114 Die Villa der Toten' problem
Diagnosis from connectivity check: 4/5 APIs reachable (only Google Books
rate-limited). So the network is fine — the search title was the problem.
'Folge 114 Die Villa der Toten' isn't indexed under that name anywhere.
The MP3 itself has the real metadata in ID3 tags (album, artist, year).

Scanner now reads ID3/Vorbis/MP4 tags from the first audio file:
- album → item.title
- albumartist / composer / artist → item.author
- date → publish_year
- organization / publisher → publisher
- language → language
- genre → genres
- artist (heuristic) → series, if it doesn't appear in album title

Parent folder name → series hint (skipped if it's a library root).

Only fills empty fields, never overwrites manually edited or matched data.
Runs on new items AND on re-scan for items without an active match.

Improved search-title normalization: prefixes and infixes such as
'Folge 123 - X' or 'Band 7: Y' are stripped so the APIs see the actual episode title.

New endpoint POST /api/items/{id}/extract-tags + 'Tags lesen' button in
BookDetail — triggers tag extraction on demand for existing items.
Returns before/after diff so user can see what was filled in.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 20:15:44 +02:00

330 lines
12 KiB
Python

import asyncio
import logging
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from ..dependencies import get_db, get_current_user, require_admin
from ..models.user import User
from ..models.media_item import LibraryItem
from ..services.matcher import match_audiobook, search_for_item, _apply_match, _enrich_match
from ..services.matching.musicbrainz import get_release_details
from ..services.matching.open_library import get_work_details
from ..services.matching.base import MatchResult
from datetime import datetime
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Router for the matching endpoints; every route declared on it gets the
# "/api/items" path prefix and the "matching" tag.
router = APIRouter(prefix="/api/items", tags=["matching"])
@router.post("/{item_id}/match")
async def trigger_match(
    item_id: str,
    background_tasks: BackgroundTasks,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Queue an automatic metadata match for one library item.

    Raises a 404 ``HTTPException`` when no ``LibraryItem`` has the given id.
    Otherwise ``match_audiobook`` is registered as a background task and a
    confirmation payload is returned without waiting for the match itself.
    """
    lookup = select(LibraryItem).where(LibraryItem.id == item_id)
    found = (await db.execute(lookup)).scalar_one_or_none()
    if not found:
        raise HTTPException(status_code=404, detail="Item not found")

    # Only the id is handed to the task; the item loaded here is not passed on.
    background_tasks.add_task(match_audiobook, item_id)
    return {"message": "Matching gestartet", "itemId": item_id}
@router.get("/{item_id}/match/search")
async def search_match(
    item_id: str,
    q: str | None = None,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Search the metadata sources for candidates matching a library item.

    With a non-empty ``q`` the search uses exactly that text and no author
    filter. Without it, the item's own title (or an empty string) and author
    are used. Raises a 404 ``HTTPException`` when the item does not exist.
    """
    lookup = select(LibraryItem).where(LibraryItem.id == item_id)
    item = (await db.execute(lookup)).scalar_one_or_none()
    if not item:
        raise HTTPException(status_code=404, detail="Item not found")

    if q:
        # A free-text query replaces the stored title and drops the author.
        query, author = q, None
    else:
        query, author = item.title or "", item.author

    return {"results": await search_for_item(query, author)}
@router.get("/match/connectivity")
async def check_connectivity(
    current_user: User = Depends(get_current_user),
):
    """Test whether the backend can reach the external metadata APIs.

    Every target is probed with a single GET request (15 s timeout, redirects
    followed). The probes run concurrently, so the endpoint takes about as
    long as the slowest target instead of the sum of all of them.

    Returns a dict with:
      - ``results``: one entry per target, in the order of ``targets``.
        ``ok`` is True whenever an HTTP response arrived, whatever its status
        code; it is False only when the request itself raised.
      - ``proxy_env``: the proxy-related environment variables, or the string
        ``"keine"`` when none are set.
    """
    import os
    import time

    import httpx

    targets = [
        ("Google", "https://www.google.com"),
        ("MusicBrainz", "https://musicbrainz.org/ws/2/release?query=test&fmt=json&limit=1"),
        ("OpenLibrary", "https://openlibrary.org/search.json?title=test&limit=1"),
        ("GoogleBooks", "https://www.googleapis.com/books/v1/volumes?q=test&maxResults=1"),
        ("DNB", "https://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=tit%3Dtest&maximumRecords=1"),
    ]
    headers = {"User-Agent": "audiolib/1.0"}

    async def _probe(client, name, url):
        """Fetch one URL and describe the outcome; never raises."""
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        started = time.perf_counter()
        try:
            r = await client.get(url)
            return {
                "name": name,
                "url": url,
                "ok": True,
                "status": r.status_code,
                "bytes": len(r.content),
                "ms": int((time.perf_counter() - started) * 1000),
                # Only include a body excerpt for non-200 answers.
                "body_snippet": (r.text[:150] if r.status_code != 200 else None),
            }
        except Exception as e:
            # Best-effort diagnostic: report the failure instead of aborting
            # the whole check.
            return {
                "name": name,
                "url": url,
                "ok": False,
                "error": f"{type(e).__name__}: {e}",
                "ms": int((time.perf_counter() - started) * 1000),
            }

    async with httpx.AsyncClient(headers=headers, timeout=15, follow_redirects=True) as client:
        # gather() keeps the order of its arguments, so results line up
        # with ``targets``.
        results = await asyncio.gather(
            *(_probe(client, name, url) for name, url in targets)
        )

    # Also report the environment variables that influence httpx.
    # NOTE(review): proxy URLs can embed credentials and this is returned to
    # any authenticated user — consider restricting the endpoint to admins.
    proxy_env = {
        k: v for k, v in os.environ.items()
        if k.upper() in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY", "ALL_PROXY")
    }
    return {
        "results": results,
        "proxy_env": proxy_env or "keine",
    }
@router.get("/match/debug")
async def debug_match(
    title: str,
    author: str | None = None,
    current_user: User = Depends(get_current_user),
):
    """Debug endpoint: return the raw results of every search source.

    Can be called straight from a browser:
    /api/items/match/debug?title=Foo&author=Bar

    The response echoes the input, the normalized search terms derived from
    it, and one entry per source holding either its hits or the error it
    raised.
    """
    from ..services.matcher import _build_search_title, detect_series
    from ..services.matching.dnb import search_dnb
    from ..services.matching.google_books import search_google_books
    from ..services.matching.musicbrainz import search_musicbrainz
    from ..services.matching.open_library import search_open_library

    series, episode = detect_series(title)
    normalized_title = _build_search_title(title)
    # A detected series + episode pair takes precedence over the normalized title.
    search_title = f"{series} {episode}" if series and episode else normalized_title
    logger.info(f"DEBUG: title={title!r} → search={search_title!r} series={series!r} episode={episode!r}")

    def _hit_to_dict(hit):
        """Flatten one search hit into a JSON-friendly dict."""
        return {
            "title": hit.title,
            "author": hit.author,
            "narrator": hit.narrator,
            "publisher": hit.publisher,
            "year": hit.publish_year,
            "series": hit.series,
            "series_sequence": hit.series_sequence,
            "cover_url": hit.cover_url,
            "language": hit.language,
            "genres": hit.genres,
            # Descriptions are cut to 200 characters.
            "description": (hit.description or "")[:200],
            "confidence": hit.confidence,
            "source_id": hit.source_id,
        }

    async def _collect(source_name, pending):
        """Await one source's search and report its hits or its failure."""
        try:
            hits = await pending
            return {
                "source": source_name,
                "ok": True,
                "count": len(hits),
                "results": [_hit_to_dict(hit) for hit in hits],
            }
        except Exception as e:
            return {"source": source_name, "ok": False, "error": f"{type(e).__name__}: {e}"}

    searchers = (
        ("musicbrainz", search_musicbrainz),
        ("open_library", search_open_library),
        ("google_books", search_google_books),
        ("dnb", search_dnb),
    )
    # All sources are queried concurrently; gather() keeps the order above.
    sources = await asyncio.gather(
        *(_collect(name, search(search_title, author)) for name, search in searchers)
    )

    return {
        "input": {"title": title, "author": author},
        "normalized": {"search_title": search_title, "series": series, "episode": episode},
        "sources": sources,
    }
@router.post("/{item_id}/match/apply")
async def apply_match(
    item_id: str,
    body: dict,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Apply a manually chosen match result to a library item.

    body: { source, id, title, author, narrator, description, publisher,
            publishYear, series, seriesSequence, language, genres, cover, ... }

    The match is built from ``body``. For the ``musicbrainz`` and
    ``open_library`` sources it is additionally enriched with details loaded
    by source id. It is then applied with confidence 1.0 and the item is
    marked as match-locked. Raises a 404 ``HTTPException`` when the item does
    not exist. Returns the updated item as produced by
    ``_enrich_item_with_files``.
    """
    result = await db.execute(select(LibraryItem).where(LibraryItem.id == item_id))
    item = result.scalar_one_or_none()
    if not item:
        raise HTTPException(status_code=404, detail="Item not found")

    # ``or`` instead of a ``.get`` default: a JSON ``null`` is a present key,
    # so ``.get(key, default)`` would hand ``None`` through to the match.
    source = body.get("source") or "manual"
    source_id = body.get("id") or ""
    logger.info(
        f"Manual apply: item={item_id} source={source} source_id={source_id} "
        f"body_keys={sorted(body.keys())}"
    )

    # Always build the match from the body; the search results are expected
    # to carry all fields already.
    match_result = MatchResult(
        source=source,
        source_id=source_id,
        title=body.get("title") or item.title or "",
        subtitle=body.get("subtitle"),
        author=body.get("author"),
        narrator=body.get("narrator"),
        description=body.get("description"),
        publisher=body.get("publisher"),
        publish_year=body.get("publishYear"),
        series=body.get("series"),
        series_sequence=body.get("seriesSequence"),
        language=body.get("language"),
        genres=body.get("genres") or [],
        cover_url=body.get("cover"),
        confidence=1.0,
    )

    # Pick the detail loader for sources that offer one.
    if source == "musicbrainz":
        load_details = get_release_details
    elif source == "open_library":
        load_details = get_work_details
    else:
        load_details = None

    # Enrich with details (description, chapters). Skipped without a source
    # id, because a lookup with an empty id cannot return anything useful.
    if load_details is not None and source_id:
        try:
            details = await load_details(source_id)
            if details:
                _enrich_match(match_result, details)
        except Exception as e:
            # Best effort: a failed detail lookup must not block the apply.
            logger.warning(f"Details-Laden fehlgeschlagen ({source}: {source_id}): {e}")

    # A manual choice is always fully trusted, whatever enrichment did.
    match_result.confidence = 1.0
    await _apply_match(db, item, match_result, confidence=1.0)
    item.match_locked = True
    item.updated_at = datetime.utcnow()
    await db.commit()
    await db.refresh(item)

    from ..routers.items import _enrich_item_with_files
    return await _enrich_item_with_files(item, db)
@router.post("/{item_id}/extract-tags")
async def extract_audio_tags(
    item_id: str,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Read the tags of the item's first audio file and apply them to the item.

    The first file by ``track_index`` is passed to ``_extract_audio_tags``;
    the resulting tags, together with a series hint derived from the item
    path, are handed to ``_apply_tags_to_item``.

    Raises a 404 ``HTTPException`` when the item does not exist. Returns
    ``{"success": False, ...}`` when the item has no files, otherwise the
    extracted tags plus a before/after snapshot of the affected fields.
    """
    from ..services.scanner import _extract_audio_tags, _apply_tags_to_item, _series_from_parent
    from ..models.media_item import BookFile
    from ..models.library import Library

    result = await db.execute(select(LibraryItem).where(LibraryItem.id == item_id))
    item = result.scalar_one_or_none()
    if not item:
        raise HTTPException(status_code=404, detail="Item not found")

    lib_result = await db.execute(select(Library).where(Library.id == item.library_id))
    lib = lib_result.scalar_one_or_none()
    library_folders = lib.folders if lib else []

    files_result = await db.execute(
        select(BookFile).where(BookFile.library_item_id == item_id).order_by(BookFile.track_index)
    )
    files = files_result.scalars().all()
    if not files:
        return {"success": False, "message": "Keine Audio-Dateien"}

    def _snapshot():
        """Capture the fields shown in the before/after diff."""
        genres = item.genres
        return {
            "title": item.title, "author": item.author, "publisher": item.publisher,
            "publish_year": item.publish_year, "series": item.series,
            # Copy the list so a later in-place change cannot leak into an
            # earlier snapshot.
            "genres": list(genres) if isinstance(genres, list) else genres,
        }

    tags = _extract_audio_tags(files[0].path)
    parent_series = _series_from_parent(item.path, library_folders)

    before = _snapshot()
    _apply_tags_to_item(item, tags, parent_series)
    item.updated_at = datetime.utcnow()
    # Snapshot before committing: a commit may expire the loaded attributes,
    # and reading them afterwards would need an implicit reload that an
    # async session cannot perform.
    after = _snapshot()
    await db.commit()

    logger.info(f"Tags extrahiert für {item_id}: tags={list(tags.keys())} before={before} after={after}")
    return {"success": True, "tags": tags, "before": before, "after": after}
@router.post("/{item_id}/extract-cover")
async def extract_local_cover(
    item_id: str,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Extract a cover from folder files or embedded artwork.

    The item path and the paths of its files (ordered by ``track_index``) are
    passed to ``_save_local_cover``. When that yields a cover, it is stored as
    the item's ``cover_path`` and committed.

    Raises a 404 ``HTTPException`` when the item does not exist. Returns
    ``{"success": True, "cover_path": ...}`` on success and
    ``{"success": False, ...}`` when no cover was found.
    """
    from ..services.scanner import _save_local_cover
    from ..models.media_item import BookFile

    result = await db.execute(select(LibraryItem).where(LibraryItem.id == item_id))
    item = result.scalar_one_or_none()
    if not item:
        raise HTTPException(status_code=404, detail="Item not found")

    files_result = await db.execute(
        select(BookFile).where(BookFile.library_item_id == item_id).order_by(BookFile.track_index)
    )
    audio_files = [f.path for f in files_result.scalars().all()]

    cover = _save_local_cover(item.path, audio_files, item.id)
    if not cover:
        # Nothing found: leave the item untouched.
        return {"success": False, "message": "Kein Cover gefunden"}

    item.cover_path = cover
    item.updated_at = datetime.utcnow()
    await db.commit()
    logger.info(f"Lokales Cover gesetzt für {item_id}: {cover}")
    return {"success": True, "cover_path": cover}
@router.delete("/{item_id}/match")
async def clear_match(
    item_id: str,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Remove the match from a library item and flag it for review.

    Resets the match source, id, confidence and lock, and makes sure the
    "zu_prüfen" tag is present. Raises a 404 ``HTTPException`` when the item
    does not exist.
    """
    result = await db.execute(select(LibraryItem).where(LibraryItem.id == item_id))
    item = result.scalar_one_or_none()
    if not item:
        raise HTTPException(status_code=404, detail="Item not found")

    item.matched_source = "none"
    item.matched_id = None
    item.match_confidence = 0.0
    item.match_locked = False

    # Work on a copy and assign a NEW list. Appending to the loaded list and
    # reassigning that same object is not seen as a change by the ORM unless
    # the column uses mutation tracking, so the tag could be silently lost.
    tags = list(item.tags or [])
    if "zu_prüfen" not in tags:
        tags.append("zu_prüfen")
    item.tags = tags

    item.updated_at = datetime.utcnow()
    await db.commit()
    return {"success": True}