Read ID3 tags during scan — fixes 'Folge 114 Die Villa der Toten' problem

Diagnosis from connectivity check: 4/5 APIs reachable (only Google Books rate-limited). So the network is fine — the search title was the problem: 'Folge 114 Die Villa der Toten' isn't indexed under that name anywhere. The MP3 itself has the real metadata in its ID3 tags (album, artist, year).

Scanner now reads ID3/Vorbis/MP4 tags from the first audio file:
- album → item.title
- composer / albumartist / artist → item.author (first non-empty, in that order)
- date → publish_year
- organization / publisher → publisher
- language → language
- genre → genres
- artist (heuristic) → series, if it doesn't appear in album title

Parent folder name → series hint (skipped if it's a library root). Only fills empty fields, never overwrites manually edited or matched data. Runs on new items AND on re-scan for items without an active match.

Search title normalization improved: 'Folge 123 - X' / 'Band 7: Y' etc. prefixes and infixes get stripped so APIs see the actual episode title.

New endpoint POST /api/items/{id}/extract-tags + 'Tags lesen' button in BookDetail — triggers tag extraction on demand for existing items. Returns a before/after diff so the user can see what was filled in.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 20:15:44 +02:00
parent 4fccb7abae
commit 0824894a7f
5 changed files with 192 additions and 2 deletions
--- a/backend/app/services/matcher.py
+++ b/backend/app/services/matcher.py
@@ -70,10 +70,17 @@ def detect_series(title: str) -> tuple[str | None, str | None]:


 def _build_search_title(original: str) -> str:
-    """Bereinigt Titel für Such-APIs: ??? raus, Sonderzeichen, Klammer-Suffixe."""
+    """Clean a title for search APIs: drop ???, Folge-N style prefixes and trailing parenthesised suffixes."""
    t = original
+    # Remove ??? (CQL wildcard problem)
    t = re.sub(r"\?{2,}", "", t)
+    # Remove a trailing parenthesised suffix such as "(Folge 123)" or "(2007)"
    t = re.sub(r"\s*\([^)]*\)\s*$", "", t)
+    # Remove a leading "Folge 123 -", "Folge 123:" or "Folge 123 " (likewise Band/Teil/Episode/Nr./#)
+    t = re.sub(r"(?i)^\s*(?:folge|band|teil|episode|nr\.?|#)\s*\d+\s*[-:–\.]*\s*", "", t)
+    # Replace a "Folge 123" marker in the middle of the title with a single space
+    t = re.sub(r"(?i)\b(?:folge|band|teil|episode|nr\.?|#)\s*\d+\b\s*[-:–\.]*\s*", " ", t)
+    # Turn runs of hyphens/underscores/en dashes into spaces; whitespace is collapsed afterwards
    t = re.sub(r"[_\-–]+", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t
--- a/backend/app/services/scanner.py
+++ b/backend/app/services/scanner.py
@@ -31,6 +31,105 @@ def _get_audio_duration(file_path: str) -> float:
    return 0.0


+def _extract_audio_tags(file_path: str) -> dict:
+    """Read ID3/Vorbis/MP4 tags via the mutagen easy API; return only non-empty values, or {} on any failure."""
+    try:
+        from mutagen import File as MutagenFile
+        audio = MutagenFile(file_path, easy=True)
+        if not audio:
+            return {}
+
+        def first(key: str):
+            v = audio.get(key)
+            if not v:
+                return None
+            if isinstance(v, list):
+                return v[0] if v else None  # list-valued tag: keep only the first entry
+            return v
+
+        result = {
+            "album": first("album"),
+            "title": first("title"),
+            "artist": first("artist"),
+            "albumartist": first("albumartist"),
+            "composer": first("composer"),
+            "date": first("date"),
+            "publisher": first("organization") or first("publisher"),  # "organization" is tried first
+            "language": first("language"),
+            "discnumber": first("discnumber"),
+            "tracknumber": first("tracknumber"),
+        }
+        genre = audio.get("genre")
+        if genre:
+            result["genres"] = genre if isinstance(genre, list) else [genre]  # always stored as a list
+        return {k: v for k, v in result.items() if v}  # drop keys whose value is missing or empty
+    except Exception as e:
+        logger.debug(f"Tag-Lesen fehlgeschlagen für {file_path}: {e}")
+        return {}
+
+
+def _series_from_parent(folder_path: str, library_folders: list) -> str | None:
+    """Return the parent folder's name as a possible series, or None if it is a library root or an implausible name."""
+    import re as _re
+    parent_path = os.path.dirname(folder_path)
+    parent = os.path.basename(parent_path)
+    if not parent:
+        return None
+    # Skip if the parent is a library root
+    for lib_folder in library_folders:
+        lib_path = lib_folder.get("fullPath", lib_folder.get("full_path", ""))  # either key spelling is accepted
+        if lib_path and os.path.normpath(parent_path) == os.path.normpath(lib_path):
+            return None
+    if 2 < len(parent) < 60 and not _re.match(r"^[\d\W]+$", parent):  # 3-59 chars, not only digits/non-word chars
+        return parent
+    return None
+
+
+def _apply_tags_to_item(item, tags: dict, parent_series_hint: str | None):
+    """Fill empty item fields from audio tags; set values are kept, except a title equal to _guess_title_from_path(item.path)."""
+    import re as _re
+
+    album = tags.get("album")
+    artist = tags.get("albumartist") or tags.get("artist")
+    composer = tags.get("composer")
+
+    # Title: the album is usually the audiobook title
+    folder_title = _guess_title_from_path(item.path)
+    if album and (not item.title or item.title == folder_title):
+        item.title = album
+
+    # Author: composer wins, else albumartist/artist. NOTE(review): this comment used to say "AlbumArtist > Composer > Artist" — confirm intended order
+    if not item.author:
+        if composer:
+            item.author = composer
+        elif artist:
+            item.author = artist
+
+    if not item.publisher and tags.get("publisher"):
+        item.publisher = tags["publisher"]
+
+    if not item.publish_year and tags.get("date"):
+        m = _re.search(r"\d{4}", str(tags["date"]))  # first run of four digits in the date tag
+        if m:
+            item.publish_year = int(m.group())
+
+    if not item.language and tags.get("language"):
+        item.language = tags["language"]
+
+    if not item.genres and tags.get("genres"):
+        item.genres = tags["genres"]
+
+    # Deriving the series from tracknumber/discnumber would be possible but unreliable.
+    # Instead: use the parent folder as the series hint.
+    if not item.series and parent_series_hint:
+        item.series = parent_series_hint
+    # For "Die drei ???" audio plays, the artist is usually the series itself
+    if not item.series and artist and len(artist) < 40:
+        # Heuristic: if the artist does not occur in the album title (case-insensitive), the artist could be the series
+        if album and artist.lower() not in album.lower():
+            item.series = artist
+
+
 def _get_file_size(file_path: str) -> int:
    try:
        return os.path.getsize(file_path)
@@ -220,6 +319,11 @@ async def scan_library_task(library_id: str, job_id: str):
                total_duration = sum(_get_audio_duration(f) for f in audio_files)
                total_size = sum(_get_file_size(f) for f in audio_files)

+                # ID3-Tags aus erster Audio-Datei lesen
+                first_audio = audio_files[0] if audio_files else None
+                tags = _extract_audio_tags(first_audio) if first_audio else {}
+                parent_series = _series_from_parent(folder_path, folders)
+
                if existing_item:
                    existing_item.duration_seconds = total_duration
                    existing_item.size_bytes = total_size
@@ -227,6 +331,11 @@ async def scan_library_task(library_id: str, job_id: str):
                    existing_item.is_missing = False
                    existing_item.updated_at = datetime.utcnow()
                    item = existing_item
+                    # Tags nachziehen wenn kein Match aktiv ist
+                    if not existing_item.match_locked and (
+                        not existing_item.matched_source or existing_item.matched_source == "none"
+                    ):
+                        _apply_tags_to_item(item, tags, parent_series)
                    # Cover aus Ordner/Embed nachziehen falls noch keins da ist
                    if not item.cover_path or not os.path.exists(item.cover_path or ""):
                        local_cover = _save_local_cover(folder_path, audio_files, item.id)
@@ -251,6 +360,13 @@ async def scan_library_task(library_id: str, job_id: str):
                    )
                    db.add(item)
                    await db.flush()
+                    # Tags anwenden
+                    _apply_tags_to_item(item, tags, parent_series)
+                    logger.info(
+                        f"Neu gescannt: id={item.id} title={item.title!r} "
+                        f"author={item.author!r} series={item.series!r} "
+                        f"year={item.publish_year} tags={list(tags.keys())}"
+                    )

                    # BookFiles anlegen
                    for idx, file_path in enumerate(audio_files):