added file and link create

2026-02-24 13:41:39 +01:00
parent 005059be84
commit ebafaef335
10 changed files with 3208 additions and 6 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Core Dependencies
 python-dotenv==1.0.0
-pydantic==2.5.0
-pydantic-settings==2.1.0
+pydantic==2.7.4
+pydantic-settings==2.3.0

 # AI & APIs
 openai==1.54.0
@@ -41,5 +41,11 @@ uvicorn==0.32.0
 jinja2==3.1.4
 python-multipart==0.0.9

+# Link Extraction
+trafilatura==1.7.0
+youtube-transcript-api==0.6.2
+pypdf==4.2.0
+docling==2.74.0
+
 # Teams Bot JWT validation
 PyJWT>=2.8.0
--- a/src/agents/link_topic_builder.py
+++ b/src/agents/link_topic_builder.py
@@ -0,0 +1,95 @@
+"""Agent to build a structured topic from extracted link content."""
+import json
+from typing import Any, Dict
+
+from loguru import logger
+
+from src.agents.base import BaseAgent
+
+
+class LinkTopicBuilderAgent(BaseAgent):
+    """Build a structured topic dictionary from extracted link content.
+
+    The agent sends the extracted text together with its source metadata to
+    the model and returns the parsed JSON answer, filling in defaults for any
+    field the model left out.
+    """
+
+    def __init__(self) -> None:
+        super().__init__("link_topic_builder")
+
+    async def process(self, source: Dict[str, Any]) -> Dict[str, Any]:
+        """Turn an extracted source into a topic dictionary.
+
+        Args:
+            source: Mapping with the keys ``text``, ``source_url``,
+                ``source_type`` and ``title``. Missing or ``None`` values
+                fall back to defaults.
+
+        Returns:
+            The topic dictionary parsed from the model response, completed
+            with default values for every missing field.
+
+        Raises:
+            ValueError: If the stripped content is shorter than 200
+                characters, or the model response is not a JSON object.
+        """
+        # ``or`` (instead of a .get default) also covers keys that are present
+        # but None, which would otherwise be printed as "None" in the prompt.
+        content = source.get("text") or ""
+        source_url = source.get("source_url") or ""
+        source_type = source.get("source_type") or "web"
+        source_title = source.get("title") or ""
+
+        if not content or len(content.strip()) < 200:
+            raise ValueError("Zu wenig Inhalt, um ein sinnvolles Topic zu erstellen.")
+
+        # Keep content length reasonable for the model
+        max_chars = 12000
+        if len(content) > max_chars:
+            content = content[:max_chars]
+
+        system_prompt = """Du bist ein Assistent, der aus einem Link-Inhalt ein strukturiertes Topic für einen LinkedIn-Post erstellt.
+
+Gib NUR valides JSON zurück (keine Erklärungen)."""
+
+        user_prompt = f"""Quelle:
+- Typ: {source_type}
+- Titel: {source_title}
+- URL: {source_url}
+
+Inhalt:
+\"\"\"{content}\"\"\"
+
+        Erstelle ein Topic im folgenden JSON-Format:
+{{
+  "title": "Kurzer, präziser Titel (max 80 Zeichen)",
+  "summary": "Detaillierte, vollumfängliche Zusammenfassung (8-12 Sätze, alle Hauptpunkte abdecken)",
+  "extended_summary": "Längere Zusammenfassung (150-250 Wörter) mit Kontext, Kernaussagen, Belegen und Implikationen",
+  "outline": ["Abschnitt 1: ...", "Abschnitt 2: ...", "..."],
+  "fact": "1-2 Sätze als Kernaussage/Fakt",
+  "key_points": ["Punkt 1", "Punkt 2", "..."],
+  "key_facts": ["Konkreter Fakt 1", "Konkreter Fakt 2", "..."],
+  "quotes": ["Kurzes Zitat falls sinnvoll", "..."],
+  "relevance": "Warum ist das für die Zielgruppe relevant? (1 Satz)",
+  "category": "Link",
+  "source": "{source_url}",
+  "source_title": "{source_title}",
+  "source_type": "{source_type}"
+}}
+
+Regeln:
+- Halte dich an das JSON-Format.
+- Keine erfundenen Fakten: nutze NUR den gegebenen Inhalt.
+- Wenn keine Zitate vorhanden sind, gib ein leeres Array zurück.
+- Summary und extended_summary müssen unterschiedliche Detailtiefe haben.
+"""
+
+        result = await self.call_openai(
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+            model="gpt-4o",
+            temperature=0.2,
+            response_format={"type": "json_object"}
+        )
+
+        try:
+            topic = json.loads(result)
+        except Exception as exc:
+            logger.warning(f"Failed to parse topic JSON: {exc}")
+            raise ValueError("Antwort konnte nicht verarbeitet werden.") from exc
+
+        # Valid JSON is not necessarily an object; the defaults below need one.
+        if not isinstance(topic, dict):
+            logger.warning(f"Topic JSON is not an object: {type(topic).__name__}")
+            raise ValueError("Antwort konnte nicht verarbeitet werden.")
+
+        # Ensure required fields exist
+        topic.setdefault("category", "Link")
+        topic.setdefault("source", source_url)
+        topic.setdefault("source_title", source_title)
+        topic.setdefault("source_type", source_type)
+        topic.setdefault("title", source_title or "Link-Thema")
+        topic.setdefault("summary", "")
+        topic.setdefault("extended_summary", "")
+        topic.setdefault("outline", [])
+        # The model may answer with "summary": null; slicing None would raise.
+        topic.setdefault("fact", (topic.get("summary") or "")[:300])
+        topic.setdefault("key_points", [])
+        topic.setdefault("key_facts", [])
+        topic.setdefault("quotes", [])
+        topic.setdefault("relevance", "Relevantes Thema für die Zielgruppe")
+
+        return topic
--- a/src/agents/writer.py
+++ b/src/agents/writer.py
@@ -351,6 +351,35 @@ class WriterAgent(BaseAgent):
        if key_facts and isinstance(key_facts, list) and len(key_facts) > 0:
            facts_section = "\n**KEY FACTS (nutze diese!):**\n" + "\n".join([f"- {f}" for f in key_facts]) + "\n"

+        summary_section = ""
+        if topic.get('summary'):
+            summary_section = f"\n**ZUSAMMENFASSUNG (vom Link):**\n{topic.get('summary')}\n"
+
+        extended_summary_section = ""
+        if topic.get('extended_summary'):
+            extended_summary_section = f"\n**DETAILLIERTE ZUSAMMENFASSUNG:**\n{topic.get('extended_summary')}\n"
+
+        outline_section = ""
+        outline = topic.get('outline', [])
+        if outline and isinstance(outline, list) and len(outline) > 0:
+            outline_section = "\n**INHALTS-GLIEDERUNG:**\n" + "\n".join([f"- {o}" for o in outline]) + "\n"
+
+        key_points_section = ""
+        key_points = topic.get('key_points', [])
+        if key_points and isinstance(key_points, list) and len(key_points) > 0:
+            key_points_section = "\n**KERNPUNKTE (vom Link):**\n" + "\n".join([f"- {p}" for p in key_points]) + "\n"
+
+        quotes_section = ""
+        quotes = topic.get('quotes', [])
+        if quotes and isinstance(quotes, list) and len(quotes) > 0:
+            quotes_section = "\n**ZITATE (optional verwenden):**\n" + "\n".join([f"- \"{q}\"" for q in quotes]) + "\n"
+
+        source_section = ""
+        if topic.get('source_title') or topic.get('source'):
+            source_title = topic.get('source_title') or ""
+            source_url = topic.get('source') or ""
+            source_section = f"\n**QUELLE:** {source_title} {source_url}\n"
+
        why_section = ""
        if topic.get('why_this_person'):
            why_section = f"\n**WARUM DU DARÜBER SCHREIBEN SOLLTEST:**\n{topic.get('why_this_person')}\n"
@@ -391,7 +420,7 @@ Schreibe einen authentischen LinkedIn-Post, der:
 {angle_section}{hook_section}
 **KERN-FAKT / INHALT:**
 {topic.get('fact', topic.get('description', ''))}
-{facts_section}{thoughts_section}
+{summary_section}{extended_summary_section}{outline_section}{key_points_section}{facts_section}{quotes_section}{source_section}{thoughts_section}
 **WARUM RELEVANT:**
 {topic.get('relevance', 'Aktuelles Thema für die Zielgruppe')}
 {why_section}
@@ -941,6 +970,35 @@ Gib NUR den überarbeiteten Post zurück - keine Kommentare."""
            if key_facts and isinstance(key_facts, list) and len(key_facts) > 0:
                facts_section = "\n**KEY FACTS (nutze diese!):**\n" + "\n".join([f"- {f}" for f in key_facts]) + "\n"

+            summary_section = ""
+            if topic.get('summary'):
+                summary_section = f"\n**ZUSAMMENFASSUNG (vom Link):**\n{topic.get('summary')}\n"
+
+            extended_summary_section = ""
+            if topic.get('extended_summary'):
+                extended_summary_section = f"\n**DETAILLIERTE ZUSAMMENFASSUNG:**\n{topic.get('extended_summary')}\n"
+
+            outline_section = ""
+            outline = topic.get('outline', [])
+            if outline and isinstance(outline, list) and len(outline) > 0:
+                outline_section = "\n**INHALTS-GLIEDERUNG:**\n" + "\n".join([f"- {o}" for o in outline]) + "\n"
+
+            key_points_section = ""
+            key_points = topic.get('key_points', [])
+            if key_points and isinstance(key_points, list) and len(key_points) > 0:
+                key_points_section = "\n**KERNPUNKTE (vom Link):**\n" + "\n".join([f"- {p}" for p in key_points]) + "\n"
+
+            quotes_section = ""
+            quotes = topic.get('quotes', [])
+            if quotes and isinstance(quotes, list) and len(quotes) > 0:
+                quotes_section = "\n**ZITATE (optional verwenden):**\n" + "\n".join([f"- \"{q}\"" for q in quotes]) + "\n"
+
+            source_section = ""
+            if topic.get('source_title') or topic.get('source'):
+                source_title = topic.get('source_title') or ""
+                source_url = topic.get('source') or ""
+                source_section = f"\n**QUELLE:** {source_title} {source_url}\n"
+
            # User thoughts section
            thoughts_section = ""
            if user_thoughts:
@@ -977,7 +1035,7 @@ Schreibe einen authentischen LinkedIn-Post, der:
 {angle_section}{hook_section}
 **KERN-FAKT / INHALT:**
 {topic.get('fact', topic.get('description', ''))}
-{facts_section}{thoughts_section}
+{summary_section}{extended_summary_section}{outline_section}{key_points_section}{facts_section}{quotes_section}{source_section}{thoughts_section}
 **WARUM RELEVANT:**
 {topic.get('relevance', 'Aktuelles Thema für die Zielgruppe')}

@@ -1063,6 +1121,16 @@ Antworte im JSON-Format:
            if post_type.description:
                post_type_section += f"\n{post_type.description}"

+        content_block = topic.get('fact', topic.get('description', 'Keine Details verfügbar'))
+        if topic.get('summary'):
+            content_block += f"\n\nZUSAMMENFASSUNG:\n{topic.get('summary')}"
+        if topic.get('extended_summary'):
+            content_block += f"\n\nDETAILLIERTE ZUSAMMENFASSUNG:\n{topic.get('extended_summary')}"
+        if topic.get('outline') and isinstance(topic.get('outline'), list):
+            content_block += "\n\nGLIEDERUNG:\n" + "\n".join([f"- {o}" for o in topic.get('outline', [])])
+        if topic.get('key_points') and isinstance(topic.get('key_points'), list):
+            content_block += "\n\nKERNPUNKTE:\n" + "\n".join([f"- {p}" for p in topic.get('key_points', [])])
+
        user_prompt = f"""Generiere 4 Hooks für dieses Thema:

 THEMA: {topic.get('title', 'Unbekanntes Thema')}
@@ -1070,7 +1138,7 @@ THEMA: {topic.get('title', 'Unbekanntes Thema')}
 KATEGORIE: {topic.get('category', 'Allgemein')}

 KERN-FAKT/INHALT:
-{topic.get('fact', topic.get('description', 'Keine Details verfügbar'))}
+{content_block}
 {thoughts_section}{post_type_section}

 Generiere jetzt die 4 verschiedenen Hooks im JSON-Format."""
--- a/src/config.py
+++ b/src/config.py
@@ -69,6 +69,10 @@ class Settings(BaseSettings):
    redis_url: str = "redis://redis:6379/0"
    scheduler_enabled: bool = False  # True only on dedicated scheduler container

+    # YouTube (optional)
+    youtube_cookies: str = ""  # Raw Cookie header value for transcript fetching
+    transcriptapi_key: str = ""  # TranscriptAPI.com key
+
    # Telegram Bot (experimental)
    telegram_enabled: bool = False
    telegram_bot_token: str = ""
--- a/src/services/file_extractor.py
+++ b/src/services/file_extractor.py
@@ -0,0 +1,54 @@
+"""Extract text content from uploaded files using Docling."""
+import os
+import tempfile
+from typing import Optional
+
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+
+
+class FileExtractionError(RuntimeError):
+    """Raised when text cannot be extracted from an uploaded file.
+
+    ``FileExtractor.extract_text`` raises it for empty input, for documents
+    that yield no text, and as a wrapper around any conversion failure.
+    """
+
+
+class FileExtractor:
+    """Extract text from files via Docling."""
+
+    def __init__(self) -> None:
+        # PDFs are converted with OCR switched off.
+        pdf_options = PdfPipelineOptions(do_ocr=False)
+        self._converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
+            }
+        )
+
+    def extract_text(self, file_bytes: bytes, filename: str) -> str:
+        """Convert an uploaded file to Markdown text.
+
+        The bytes are written to a temporary file (keeping the original file
+        suffix), converted with Docling and exported as Markdown. The
+        temporary file is always removed afterwards.
+
+        Args:
+            file_bytes: Raw content of the uploaded file.
+            filename: Original file name; only its suffix is used.
+
+        Returns:
+            The stripped Markdown export of the document.
+
+        Raises:
+            FileExtractionError: If the input is empty, the document contains
+                no text, or the conversion fails.
+        """
+        if not file_bytes:
+            raise FileExtractionError("Leere Datei.")
+
+        # splitext already yields "" for names without an extension.
+        suffix = os.path.splitext(filename)[1] if filename else ""
+
+        tmp_path: Optional[str] = None
+        try:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                # Remember the path before writing so a failed write still
+                # gets cleaned up in the finally block.
+                tmp_path = tmp.name
+                tmp.write(file_bytes)
+
+            result = self._converter.convert(tmp_path)
+            document = result.document
+            text = document.export_to_markdown().strip()
+            if not text:
+                raise FileExtractionError("Keine Inhalte im Dokument gefunden.")
+            return text
+        except FileExtractionError:
+            raise
+        except Exception as exc:
+            raise FileExtractionError(f"Datei konnte nicht verarbeitet werden: {exc}") from exc
+        finally:
+            if tmp_path is not None:
+                try:
+                    os.unlink(tmp_path)
+                except OSError:
+                    # Best-effort cleanup: a leftover temp file must not mask
+                    # the extraction result or the original error.
+                    pass
--- a/src/services/link_extractor.py
+++ b/src/services/link_extractor.py
@@ -0,0 +1,324 @@
+"""Extract content from links (web pages, PDFs, YouTube transcripts)."""
+import asyncio
+import ipaddress
+import re
+from io import BytesIO
+from typing import Dict, Optional
+from urllib.parse import urlparse, parse_qs
+
+import httpx
+from loguru import logger
+
+from src.config import settings
+import trafilatura
+try:
+    from trafilatura.metadata import extract_metadata as trafilatura_extract_metadata
+except Exception:  # pragma: no cover - optional path
+    trafilatura_extract_metadata = None
+from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
+from pypdf import PdfReader
+
+
+class LinkExtractionError(RuntimeError):
+    """Raised when a link is rejected or its content cannot be extracted.
+
+    The messages are written for end users; the ``/api/link-extract`` route
+    returns ``str(exc)`` as the detail of an HTTP 400 response.
+    """
+
+
+class LinkExtractor:
+    """Extract text content from supported link types."""
+
+    MAX_BYTES = 2_000_000  # ~2MB
+    TIMEOUT = 10.0
+    MAX_REDIRECTS = 3
+
+    async def extract(self, url: str) -> Dict[str, Optional[str]]:
+        """Extract text and metadata from a link.
+
+        The URL is normalised and validated first. YouTube links yield a
+        transcript, PDFs their page text, and everything else is treated as an
+        HTML page.
+
+        Returns:
+            A dictionary with the keys ``source_url``, ``source_type``
+            (``"youtube"``, ``"pdf"`` or ``"web"``), ``title`` and ``text``.
+
+        Raises:
+            LinkExtractionError: If the link is invalid or no content could
+                be extracted.
+        """
+        target = self._normalize_url(url)
+        self._validate_url(target)
+
+        if not self._is_youtube_url(target):
+            payload, mime = await self._fetch_url(target)
+
+            if self._is_pdf(target, mime):
+                pdf_text, pdf_title = self._extract_pdf(payload)
+                return {
+                    "source_url": target,
+                    "source_type": "pdf",
+                    "title": pdf_title or "PDF Dokument",
+                    "text": pdf_text,
+                }
+
+            page_text, page_title = self._extract_html(payload)
+            if not page_text:
+                raise LinkExtractionError("Konnte keinen lesbaren Text aus der Seite extrahieren.")
+            return {
+                "source_url": target,
+                "source_type": "web",
+                "title": page_title or "Webseite",
+                "text": page_text,
+            }
+
+        video_id = self._extract_youtube_id(target)
+        if not video_id:
+            raise LinkExtractionError("YouTube-Link konnte nicht erkannt werden.")
+
+        transcript, title = "", ""
+
+        # Try the hosted TranscriptAPI first when a key is configured; any
+        # failure there falls through to the direct fetch below.
+        api_key_configured = bool((settings.transcriptapi_key or "").strip())
+        if api_key_configured:
+            try:
+                transcript, title = await self._fetch_transcriptapi(target)
+            except Exception as exc:
+                logger.warning(f"TranscriptAPI failed, falling back to direct fetch: {exc}")
+
+        if not transcript:
+            transcript = await self._fetch_youtube_transcript(video_id)
+        if not title:
+            title = await self._fetch_youtube_title(target)
+
+        return {
+            "source_url": target,
+            "source_type": "youtube",
+            "title": title or "YouTube Video",
+            "text": transcript,
+        }
+
+    def _normalize_url(self, url: str) -> str:
+        """Trim the input and prepend ``https://`` when no http(s) scheme is given.
+
+        Raises:
+            LinkExtractionError: If the input is empty after trimming.
+        """
+        cleaned = url.strip()
+        if cleaned == "":
+            raise LinkExtractionError("Bitte einen Link eingeben.")
+
+        has_scheme = re.match(r"^https?://", cleaned, re.IGNORECASE) is not None
+        return cleaned if has_scheme else f"https://{cleaned}"
+
+    def _validate_url(self, url: str) -> None:
+        """Reject URLs that are not plain http(s) links to a public host.
+
+        Only the literal host name is inspected.
+        NOTE(review): a public-looking name that *resolves* to an internal
+        address is not detected here; that needs a check at connect time.
+
+        Raises:
+            LinkExtractionError: If the scheme is not http/https, the host is
+                missing, or the host is a local name or a non-public IP
+                literal.
+        """
+        parsed = urlparse(url)
+        if parsed.scheme not in ("http", "https"):
+            raise LinkExtractionError("Nur http/https Links sind erlaubt.")
+        if not parsed.hostname:
+            raise LinkExtractionError("Ungültiger Link.")
+
+        # A fully-qualified name may end with a dot ("localhost."), which
+        # would otherwise slip past the comparisons below.
+        hostname = parsed.hostname.lower().rstrip(".")
+        if not hostname:
+            raise LinkExtractionError("Ungültiger Link.")
+
+        if hostname in {"localhost", "127.0.0.1", "0.0.0.0", "::1"}:
+            raise LinkExtractionError("Lokale Links sind nicht erlaubt.")
+        if hostname.endswith((".local", ".localhost")):
+            raise LinkExtractionError("Lokale Links sind nicht erlaubt.")
+
+        # Block non-public IPs if hostname is an IP literal
+        try:
+            ip = ipaddress.ip_address(hostname)
+        except ValueError:
+            # Not an IP literal: a regular host name, nothing more to check.
+            return
+
+        if (
+            ip.is_private
+            or ip.is_loopback
+            or ip.is_link_local
+            or ip.is_reserved
+            or ip.is_multicast
+            or ip.is_unspecified
+        ):
+            raise LinkExtractionError("Lokale Links sind nicht erlaubt.")
+
+    async def _fetch_url(self, url: str) -> tuple[bytes, str]:
+        """Download ``url`` and return ``(body, content_type)``.
+
+        Redirects are followed by hand (at most ``MAX_REDIRECTS``) so that
+        every hop goes through ``_validate_url``; otherwise a public URL could
+        redirect the request to an internal address. The body is streamed so
+        the download stops as soon as it exceeds ``MAX_BYTES`` instead of
+        buffering an arbitrarily large response first.
+
+        Returns:
+            The raw response body and the lower-cased ``Content-Type`` header
+            (empty string when the header is missing).
+
+        Raises:
+            LinkExtractionError: If a redirect target is not allowed, there
+                are too many redirects, or the body is larger than
+                ``MAX_BYTES``.
+            httpx.HTTPError: For transport errors and non-success status
+                codes.
+        """
+        headers = {
+            "User-Agent": "Mozilla/5.0 (compatible; LinkedInWorkflowBot/1.0)"
+        }
+        redirect_codes = (301, 302, 303, 307, 308)
+        async with httpx.AsyncClient(
+            follow_redirects=False,
+            timeout=self.TIMEOUT,
+            headers=headers,
+            limits=httpx.Limits(max_keepalive_connections=5, max_connections=10),
+        ) as client:
+            current_url = url
+            # One initial request plus up to MAX_REDIRECTS redirects.
+            for _ in range(self.MAX_REDIRECTS + 1):
+                async with client.stream("GET", current_url) as response:
+                    location = response.headers.get("location")
+                    if response.status_code in redirect_codes and location:
+                        # Location may be relative; resolve it first.
+                        current_url = str(response.url.join(location))
+                        self._validate_url(current_url)
+                        continue
+
+                    response.raise_for_status()
+
+                    # Cheap early exit when the server announces the size.
+                    declared = response.headers.get("content-length", "")
+                    if declared.isdigit() and int(declared) > self.MAX_BYTES:
+                        raise LinkExtractionError("Die Seite ist zu groß, um verarbeitet zu werden.")
+
+                    chunks = []
+                    received = 0
+                    async for chunk in response.aiter_bytes():
+                        received += len(chunk)
+                        if received > self.MAX_BYTES:
+                            raise LinkExtractionError("Die Seite ist zu groß, um verarbeitet zu werden.")
+                        chunks.append(chunk)
+
+                    content_type = response.headers.get("content-type", "").lower()
+                    return b"".join(chunks), content_type
+
+        raise LinkExtractionError("Zu viele Weiterleitungen.")
+
+    def _is_pdf(self, url: str, content_type: str) -> bool:
+        """Return True when the response header or the URL points to a PDF.
+
+        Args:
+            url: The requested URL.
+            content_type: Lower-cased ``Content-Type`` header value.
+        """
+        if "application/pdf" in content_type:
+            return True
+        if url.lower().endswith(".pdf"):
+            return True
+        # Also look at the path on its own so a query string or fragment
+        # ("report.pdf?download=1") does not hide the extension.
+        return urlparse(url).path.lower().endswith(".pdf")
+
+    def _extract_pdf(self, content: bytes) -> tuple[str, Optional[str]]:
+        """Read the text of every page and the metadata title from PDF bytes.
+
+        Returns:
+            ``(text, title)``; ``title`` is None when the document has no
+            metadata title or reading it fails.
+
+        Raises:
+            LinkExtractionError: If the PDF contains no extractable text or
+                cannot be parsed.
+        """
+        try:
+            pdf = PdfReader(BytesIO(content))
+            per_page = [page.extract_text() or "" for page in pdf.pages]
+            body = "\n".join(chunk for chunk in per_page if chunk).strip()
+
+            # The title is optional; never let broken metadata fail the run.
+            try:
+                doc_title = pdf.metadata.title if pdf.metadata else None
+            except Exception:
+                doc_title = None
+
+            if body:
+                return body, doc_title
+            raise LinkExtractionError("Konnte keinen Text aus dem PDF extrahieren.")
+        except LinkExtractionError:
+            raise
+        except Exception as exc:
+            raise LinkExtractionError(f"PDF-Extraktion fehlgeschlagen: {exc}") from exc
+
+    def _extract_html(self, content: bytes) -> tuple[str, Optional[str]]:
+        """Extract the main text and, when available, the title of an HTML page.
+
+        The bytes are decoded as UTF-8 with undecodable bytes dropped.
+
+        Returns:
+            ``(text, title)``; ``text`` is an empty string when nothing could
+            be extracted and ``title`` is None when no title was found.
+        """
+        markup = content.decode("utf-8", errors="ignore")
+        extracted = trafilatura.extract(
+            markup,
+            include_comments=False,
+            include_tables=False,
+            include_formatting=False,
+            output_format="txt"
+        )
+
+        # The metadata helper is an optional import and may be None.
+        page_title = None
+        try:
+            meta = trafilatura_extract_metadata(markup) if trafilatura_extract_metadata else None
+            if meta and meta.title:
+                page_title = meta.title
+        except Exception:
+            page_title = None
+
+        cleaned = extracted.strip() if extracted else ""
+        return cleaned, page_title
+
+    def _is_youtube_url(self, url: str) -> bool:
+        """Return True when the URL's host is youtube.com, youtu.be or a subdomain.
+
+        The host is compared exactly or by ``.domain`` suffix; a plain
+        substring test would also accept hosts such as
+        ``youtube.com.evil.example`` or ``notyoutu.be``.
+        """
+        host = (urlparse(url).hostname or "").lower().rstrip(".")
+        return any(
+            host == domain or host.endswith(f".{domain}")
+            for domain in ("youtube.com", "youtu.be")
+        )
+
+    def _extract_youtube_id(self, url: str) -> Optional[str]:
+        """Return the video id contained in a YouTube URL, or None.
+
+        Supported shapes: ``youtu.be/<id>``, ``youtube.com/watch?v=<id>`` and
+        the path forms ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and
+        ``/v/<id>``.
+        """
+        parsed = urlparse(url)
+        host = (parsed.hostname or "").lower()
+        # Non-empty path segments, e.g. "/shorts/abc" -> ["shorts", "abc"].
+        segments = [part for part in parsed.path.split("/") if part]
+
+        if "youtu.be" in host:
+            return segments[0] if segments else None
+
+        if "youtube.com" in host:
+            video_ids = parse_qs(parsed.query).get("v")
+            if video_ids:
+                return video_ids[0]
+            if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
+                return segments[1]
+
+        return None
+
+    async def _fetch_youtube_transcript(self, video_id: str) -> str:
+        """Fetch a video's transcript directly from YouTube as plain text.
+
+        Transcripts are looked up in German and English variants, preferring
+        manually created captions over generated ones.
+
+        Raises:
+            LinkExtractionError: If no transcript is available or the fetch
+                fails for any other reason.
+        """
+        # Read the settings before the try block so the error handler below
+        # can always rely on ``cookies`` being bound.
+        cookies = (settings.youtube_cookies or "").strip()
+        languages = ["de", "de-DE", "de-AT", "de-CH", "en", "en-US", "en-GB"]
+
+        try:
+            # Prefer full transcript list to handle generated vs. manually created captions
+            transcript_list = await asyncio.to_thread(
+                YouTubeTranscriptApi.list_transcripts,
+                video_id,
+                cookies=cookies or None
+            )
+
+            # Try the lookups in order of preference; a failing lookup simply
+            # moves on to the next one.
+            transcript = None
+            for finder_name in (
+                "find_manually_created_transcript",
+                "find_generated_transcript",
+                "find_transcript",
+            ):
+                try:
+                    transcript = getattr(transcript_list, finder_name)(languages)
+                except Exception:
+                    transcript = None
+                if transcript is not None:
+                    break
+
+            if transcript is None:
+                raise LinkExtractionError("Kein passendes Transkript gefunden.")
+
+            data = await asyncio.to_thread(transcript.fetch)
+            text = " ".join([item.get("text", "") for item in data]).strip()
+            if not text:
+                raise LinkExtractionError("Kein Transkript verfügbar.")
+            return text
+        except LinkExtractionError:
+            # Our own, already user-facing errors must not be replaced by the
+            # generic message (and the network debug dump) further down.
+            raise
+        except (TranscriptsDisabled, NoTranscriptFound) as exc:
+            raise LinkExtractionError("Für dieses Video ist kein Transkript verfügbar.") from exc
+        except Exception as exc:
+            logger.exception(f"YouTube transcript fetch failed: {exc}")
+            debug_dump = await self._debug_youtube_dump(video_id, cookies or None)
+            logger.warning(f"YouTube debug dump for {video_id}:\n{debug_dump}")
+            raise LinkExtractionError(
+                "YouTube-Transkript konnte nicht geladen werden. "
+                "YouTube blockiert den Abruf manchmal. "
+                "Wenn du Zugriff hast, setze die Umgebungsvariable "
+                "`YOUTUBE_COOKIES` mit gültigen Cookies."
+            ) from exc
+
+    async def _debug_youtube_dump(self, video_id: str, cookies: Optional[str]) -> str:
+        """Probe several YouTube endpoints and return a text report for logging.
+
+        For every probed URL the report lists the status code and the first
+        2000 characters of the body, or the error that occurred. Failures of
+        individual requests never raise.
+        """
+        probe_urls = [
+            f"https://www.youtube.com/api/timedtext?type=list&v={video_id}",
+            f"https://www.youtube.com/api/timedtext?lang=de&v={video_id}",
+            f"https://www.youtube.com/api/timedtext?lang=de&v={video_id}&fmt=json3",
+            f"https://www.youtube.com/api/timedtext?lang=en&v={video_id}",
+            f"https://www.youtube.com/watch?v={video_id}",
+        ]
+        request_headers = {
+            "User-Agent": "Mozilla/5.0 (compatible; LinkedInWorkflowBot/1.0)"
+        }
+        if cookies:
+            request_headers["Cookie"] = cookies
+
+        report = [f"cookies_used={bool(cookies)}"]
+        async with httpx.AsyncClient(timeout=self.TIMEOUT, headers=request_headers) as client:
+            for probe in probe_urls:
+                # The URL line comes first for both outcomes.
+                report.append(f"URL: {probe}")
+                try:
+                    reply = await client.get(probe)
+                    # Keep the snippet on a single log line.
+                    excerpt = reply.text[:2000].replace("\n", "\\n").replace("\r", "")
+                except Exception as exc:
+                    report.append(f"ERROR: {repr(exc)}")
+                else:
+                    report.append(f"STATUS: {reply.status_code}")
+                    report.append(f"BODY_SNIPPET: {excerpt}")
+
+        return "\n".join(report)
+
+    async def _fetch_transcriptapi(self, video_url: str) -> tuple[str, str]:
+        """Fetch transcript and title of a video through TranscriptAPI.
+
+        Returns:
+            ``(text, title)``; ``title`` is an empty string when the response
+            carries no metadata title.
+
+        Raises:
+            LinkExtractionError: If no API key is configured, the service
+                does not answer with status 200, or the transcript is empty.
+        """
+        api_key = (settings.transcriptapi_key or "").strip()
+        if not api_key:
+            raise LinkExtractionError("TranscriptAPI Key fehlt.")
+
+        query = {
+            "video_url": video_url,
+            "format": "json",
+            "include_timestamp": "false",
+            "send_metadata": "true",
+        }
+        auth_headers = {"Authorization": f"Bearer {api_key}"}
+
+        async with httpx.AsyncClient(timeout=self.TIMEOUT) as client:
+            resp = await client.get(
+                "https://transcriptapi.com/api/v2/youtube/transcript",
+                params=query,
+                headers=auth_headers,
+            )
+            if resp.status_code != 200:
+                raise LinkExtractionError(f"TranscriptAPI Fehler: {resp.status_code}")
+            payload = resp.json()
+
+        # The transcript is handled both as a list of segments and as a
+        # single string.
+        raw = payload.get("transcript")
+        if isinstance(raw, list):
+            text = " ".join([segment.get("text", "") for segment in raw]).strip()
+        elif isinstance(raw, str):
+            text = raw.strip()
+        else:
+            text = ""
+
+        meta = payload.get("metadata") or {}
+        title = (meta.get("title", "") or "") if isinstance(meta, dict) else ""
+
+        if not text:
+            raise LinkExtractionError("TranscriptAPI lieferte kein Transkript.")
+
+        return text, title
+
+    async def _fetch_youtube_title(self, url: str) -> Optional[str]:
+        """Look up a video's title via YouTube's oEmbed endpoint.
+
+        Returns:
+            The title, or None when the request fails, does not answer with
+            status 200, or the response has no ``title`` field.
+        """
+        # Pass the video URL through ``params`` so it gets percent-encoded;
+        # pasted into the query string verbatim, any "&" or "?" inside it
+        # would be parsed as part of the oEmbed request itself.
+        query = {"format": "json", "url": url}
+        try:
+            async with httpx.AsyncClient(timeout=self.TIMEOUT) as client:
+                response = await client.get("https://www.youtube.com/oembed", params=query)
+                if response.status_code != 200:
+                    return None
+                data = response.json()
+                return data.get("title")
+        except Exception:
+            # The title is optional; the caller falls back to a default.
+            return None
--- a/src/web/templates/user/create_post_file.html
+++ b/src/web/templates/user/create_post_file.html
--- a/src/web/templates/user/create_post_link.html
+++ b/src/web/templates/user/create_post_link.html
--- a/src/web/templates/user/create_post_select.html
+++ b/src/web/templates/user/create_post_select.html
@@ -14,7 +14,7 @@
    </div>
    {% endif %}

-    <div class="grid md:grid-cols-2 gap-6">
+    <div class="grid grid-cols-1 md:grid-cols-2 gap-6">
        <a href="/create/wizard"
           class="card-bg border rounded-xl p-6 hover:bg-brand-bg-light transition-colors group {% if limit_reached %}opacity-50 pointer-events-none{% endif %}">
            <div class="w-12 h-12 bg-brand-highlight/20 rounded-lg flex items-center justify-center mb-4">
@@ -38,6 +38,30 @@
            <h2 class="text-lg font-semibold text-white mb-2">Per Chat erstellen</h2>
            <p class="text-gray-400 text-sm">Schnell und flexibel mit dem Chat-Assistenten.</p>
        </a>
+
+        <a href="/create/link-wizard"
+           class="card-bg border rounded-xl p-6 hover:bg-brand-bg-light transition-colors group {% if limit_reached %}opacity-50 pointer-events-none{% endif %}">
+            <div class="w-12 h-12 bg-brand-highlight/20 rounded-lg flex items-center justify-center mb-4">
+                <svg class="w-6 h-6 text-brand-highlight" fill="none" stroke="currentColor" viewBox="0 0 24 24">
+                    <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2"
+                          d="M13.828 10.172a4 4 0 010 5.656l-3 3a4 4 0 11-5.656-5.656l1.5-1.5m4.328-4.328a4 4 0 015.656 0l3 3a4 4 0 11-5.656 5.656l-1.5-1.5"/>
+                </svg>
+            </div>
+            <h2 class="text-lg font-semibold text-white mb-2">Aus Link erstellen</h2>
+            <p class="text-gray-400 text-sm">Webseite, PDF oder YouTube-Link analysieren und daraus posten.</p>
+        </a>
+
+        <a href="/create/file-wizard"
+           class="card-bg border rounded-xl p-6 hover:bg-brand-bg-light transition-colors group {% if limit_reached %}opacity-50 pointer-events-none{% endif %}">
+            <div class="w-12 h-12 bg-brand-highlight/20 rounded-lg flex items-center justify-center mb-4">
+                <svg class="w-6 h-6 text-brand-highlight" fill="none" stroke="currentColor" viewBox="0 0 24 24">
+                    <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2"
+                          d="M7 2h7l5 5v13a2 2 0 01-2 2H7a2 2 0 01-2-2V4a2 2 0 012-2z"/>
+                </svg>
+            </div>
+            <h2 class="text-lg font-semibold text-white mb-2">Aus Datei erstellen</h2>
+            <p class="text-gray-400 text-sm">Datei hochladen und daraus einen Post generieren.</p>
+        </a>
    </div>
 </div>
 {% endblock %}
--- a/src/web/user/routes.py
+++ b/src/web/user/routes.py
@@ -41,6 +41,9 @@ from src.services.background_jobs import (
 )
 from src.services.db_job_manager import job_manager
 from src.services.storage_service import storage
+from src.services.link_extractor import LinkExtractor, LinkExtractionError
+from src.services.file_extractor import FileExtractor, FileExtractionError
+from src.agents.link_topic_builder import LinkTopicBuilderAgent

 # Router for user frontend
 user_router = APIRouter(tags=["user"])
@@ -1813,6 +1816,64 @@ async def create_post_page(request: Request):
    })


+@user_router.get("/create/link-wizard", response_class=HTMLResponse)
+async def create_post_link_page(request: Request):
+    """Render the wizard page for creating a post from a link.
+
+    Users without a session are redirected to ``/login``.
+    """
+    session = require_user_session(request)
+    if not session:
+        return RedirectResponse(url="/login", status_code=302)
+
+    # The token limit is only checked for company/employee accounts that
+    # belong to a company.
+    limit_reached, limit_message = False, ""
+    is_company_account = session.account_type in ("company", "employee")
+    if is_company_account and session.company_id:
+        allowed, reason, _, _ = await db.check_company_token_limit(UUID(session.company_id))
+        limit_reached, limit_message = not allowed, reason
+
+    profile_picture = await get_user_avatar(session, UUID(session.user_id))
+
+    context = {
+        "request": request,
+        "page": "create",
+        "session": session,
+        "user_id": session.user_id,
+        "limit_reached": limit_reached,
+        "limit_message": limit_message,
+        "profile_picture": profile_picture
+    }
+    return templates.TemplateResponse("create_post_link.html", context)
+
+
+@user_router.get("/create/file-wizard", response_class=HTMLResponse)
+async def create_post_file_page(request: Request):
+    """Render the wizard page for creating a post from an uploaded file.
+
+    Users without a session are redirected to ``/login``.
+    """
+    session = require_user_session(request)
+    if not session:
+        return RedirectResponse(url="/login", status_code=302)
+
+    # The token limit is only checked for company/employee accounts that
+    # belong to a company.
+    limit_reached, limit_message = False, ""
+    is_company_account = session.account_type in ("company", "employee")
+    if is_company_account and session.company_id:
+        allowed, reason, _, _ = await db.check_company_token_limit(UUID(session.company_id))
+        limit_reached, limit_message = not allowed, reason
+
+    profile_picture = await get_user_avatar(session, UUID(session.user_id))
+
+    context = {
+        "request": request,
+        "page": "create",
+        "session": session,
+        "user_id": session.user_id,
+        "limit_reached": limit_reached,
+        "limit_message": limit_message,
+        "profile_picture": profile_picture
+    }
+    return templates.TemplateResponse("create_post_file.html", context)
+
+
@user_router.get("/chat-create", response_class=HTMLResponse)
 async def chat_create_page(request: Request):
    """Chat-based post creation page."""
@@ -2063,6 +2124,117 @@ async def transcribe_audio(request: Request):
        raise HTTPException(status_code=500, detail=str(e))


+@user_router.post("/api/link-extract")
+async def extract_link(request: Request):
+    """Extract context from a link and build a structured topic.
+
+    The JSON body may contain the string fields ``url``, ``transcript``,
+    ``title``, ``source_type`` and ``source_url``. A non-empty ``transcript``
+    is used directly as the source text; otherwise the content is extracted
+    from ``url``.
+
+    Returns:
+        A dict with the generated ``topic`` and the ``source`` it was built
+        from.
+
+    Raises:
+        HTTPException: 401 without a user session, 429 when the company token
+            limit is reached, 400 for an invalid body, missing input, failed
+            extraction or unusable content, 500 for unexpected errors.
+    """
+    session = require_user_session(request)
+    if not session:
+        raise HTTPException(status_code=401, detail="Not authenticated")
+
+    # Check token limit for companies/employees
+    if session.account_type in ("company", "employee") and session.company_id:
+        can_create, error_msg, _, _ = await db.check_company_token_limit(UUID(session.company_id))
+        if not can_create:
+            raise HTTPException(status_code=429, detail=error_msg)
+
+    try:
+        try:
+            data = await request.json()
+        except ValueError as exc:
+            # JSON and Unicode decode errors are ValueError subclasses; a
+            # malformed body is a client error, not a server failure.
+            raise HTTPException(status_code=400, detail="Ungültiges JSON im Request.") from exc
+        if not isinstance(data, dict):
+            raise HTTPException(status_code=400, detail="Ungültiges JSON im Request.")
+
+        def text_field(key: str) -> str:
+            """Return the stripped string stored under *key*, or "" if absent or not a string."""
+            value = data.get(key)
+            return value.strip() if isinstance(value, str) else ""
+
+        url = text_field("url")
+        transcript = text_field("transcript")
+        manual_title = text_field("title")
+        source_type = text_field("source_type")
+        source_url = text_field("source_url")
+
+        if transcript:
+            source = {
+                "source_url": source_url or url,
+                "source_type": source_type or "manual",
+                "title": manual_title or "Manuelles Transkript",
+                "text": transcript
+            }
+        else:
+            # Without a transcript the URL is the only possible input.
+            if not url:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Bitte eine URL oder ein Transkript angeben."
+                )
+            extractor = LinkExtractor()
+            try:
+                source = await extractor.extract(url)
+            except LinkExtractionError as exc:
+                raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+        builder = LinkTopicBuilderAgent()
+        builder.set_tracking_context(
+            operation="link_extract",
+            user_id=session.user_id,
+            company_id=session.company_id
+        )
+        try:
+            topic = await builder.process(source)
+        except ValueError as exc:
+            # The agent rejects unusable input (e.g. too little content) with a
+            # plain ValueError. Subclasses such as JSON decode errors indicate
+            # a server-side failure and fall through to the 500 handler.
+            if type(exc) is not ValueError:
+                raise
+            raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+        return {"topic": topic, "source": source}
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception(f"Link extraction failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@user_router.post("/api/file-extract")
+async def extract_file(request: Request):
+    """Extract context from an uploaded file and build a structured topic.
+
+    Expects a multipart form with the upload in the ``file`` field. Only the
+    extensions in ``allowed_ext`` and files of at most 10 MB are accepted.
+
+    Returns:
+        A dict with the generated ``topic`` and the ``source`` it was built
+        from.
+
+    Raises:
+        HTTPException: 401 without a user session, 429 when the company token
+            limit is reached, 400 for a missing, empty, unsupported, oversized
+            or unusable file, 500 for unexpected errors.
+    """
+    session = require_user_session(request)
+    if not session:
+        raise HTTPException(status_code=401, detail="Not authenticated")
+
+    # Check token limit for companies/employees
+    if session.account_type in ("company", "employee") and session.company_id:
+        can_create, error_msg, _, _ = await db.check_company_token_limit(UUID(session.company_id))
+        if not can_create:
+            raise HTTPException(status_code=429, detail=error_msg)
+
+    try:
+        form = await request.form()
+        upload = form.get("file")
+        # A plain text field named "file" arrives as str, not as an upload.
+        if not upload or isinstance(upload, str):
+            raise HTTPException(status_code=400, detail="Keine Datei hochgeladen.")
+
+        # Basic validation
+        allowed_ext = {".pdf", ".docx", ".pptx", ".xlsx", ".txt", ".md", ".rtf"}
+        filename = upload.filename or ""
+        ext = Path(filename).suffix.lower()
+        if not ext or ext not in allowed_ext:
+            raise HTTPException(status_code=400, detail="Dateityp nicht unterstützt.")
+
+        max_bytes = 10 * 1024 * 1024  # 10 MB
+        # Read at most one byte beyond the limit: enough to detect an oversized
+        # upload without loading the whole file into memory first.
+        file_bytes = await upload.read(max_bytes + 1)
+        if len(file_bytes) > max_bytes:
+            raise HTTPException(status_code=400, detail="Datei ist zu groß (max 10 MB).")
+        if not file_bytes:
+            raise HTTPException(status_code=400, detail="Datei ist leer.")
+
+        extractor = FileExtractor()
+        try:
+            # NOTE(review): extract_text is called synchronously and may block
+            # the event loop for large documents - consider a worker thread.
+            text = extractor.extract_text(file_bytes, filename)
+        except FileExtractionError as exc:
+            raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+        source = {
+            "source_url": "",
+            "source_type": "file",
+            "title": filename or "Datei",
+            "text": text
+        }
+
+        builder = LinkTopicBuilderAgent()
+        builder.set_tracking_context(
+            operation="file_extract",
+            user_id=session.user_id,
+            company_id=session.company_id
+        )
+        try:
+            topic = await builder.process(source)
+        except ValueError as exc:
+            # The agent rejects unusable input (e.g. too little content) with a
+            # plain ValueError. Subclasses such as JSON decode errors indicate
+            # a server-side failure and fall through to the 500 handler.
+            if type(exc) is not ValueError:
+                raise
+            raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+        return {"topic": topic, "source": source}
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception(f"File extraction failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
@user_router.post("/api/hooks")
 async def generate_hooks(
    request: Request,