diff --git a/requirements.txt b/requirements.txt
index 8f0f55c..a7d9ec1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Core Dependencies
 python-dotenv==1.0.0
-pydantic==2.5.0
-pydantic-settings==2.1.0
+pydantic==2.7.4
+pydantic-settings==2.3.0
 
 # AI & APIs
 openai==1.54.0
@@ -41,5 +41,11 @@ uvicorn==0.32.0
 jinja2==3.1.4
 python-multipart==0.0.9
 
+# Link Extraction
+trafilatura==1.7.0
+youtube-transcript-api==0.6.2
+pypdf==4.2.0
+docling==2.74.0
+
 # Teams Bot JWT validation
 PyJWT>=2.8.0
diff --git a/src/agents/link_topic_builder.py b/src/agents/link_topic_builder.py
new file mode 100644
index 0000000..51b2dc0
--- /dev/null
+++ b/src/agents/link_topic_builder.py
@@ -0,0 +1,115 @@
+"""Agent to build a structured topic from extracted link content."""
+import json
+from typing import Any, Dict
+
+from loguru import logger
+
+from src.agents.base import BaseAgent
+
+
+class LinkTopicBuilderAgent(BaseAgent):
+    """Build a structured topic dictionary from link content."""
+
+    def __init__(self) -> None:
+        super().__init__("link_topic_builder")
+
+    async def process(self, source: Dict[str, Any]) -> Dict[str, Any]:
+        """Turn extracted link content into a topic dictionary.
+
+        Args:
+            source: Extraction result; the keys ``text``, ``source_url``,
+                ``source_type`` and ``title`` are read.
+
+        Returns:
+            The JSON object returned by the model, with defaults added for
+            every topic field it left out.
+
+        Raises:
+            ValueError: If the text has fewer than 200 characters or the
+                model response is not a JSON object.
+        """
+        content = source.get("text", "")
+        source_url = source.get("source_url", "")
+        source_type = source.get("source_type", "web")
+        source_title = source.get("title", "")
+
+        if not content or len(content.strip()) < 200:
+            raise ValueError("Zu wenig Inhalt, um ein sinnvolles Topic zu erstellen.")
+
+        # Keep content length reasonable for the model
+        max_chars = 12000
+        if len(content) > max_chars:
+            content = content[:max_chars]
+
+        system_prompt = """Du bist ein Assistent, der aus einem Link-Inhalt ein strukturiertes Topic für einen LinkedIn-Post erstellt.
+
+Gib NUR valides JSON zurück (keine Erklärungen)."""
+
+        user_prompt = f"""Quelle:
+- Typ: {source_type}
+- Titel: {source_title}
+- URL: {source_url}
+
+Inhalt:
+\"\"\"{content}\"\"\"
+
+ Erstelle ein Topic im folgenden JSON-Format:
+{{
+  "title": "Kurzer, präziser Titel (max 80 Zeichen)",
+  "summary": "Detaillierte, vollumfängliche Zusammenfassung (8-12 Sätze, alle Hauptpunkte abdecken)",
+  "extended_summary": "Längere Zusammenfassung (150-250 Wörter) mit Kontext, Kernaussagen, Belegen und Implikationen",
+  "outline": ["Abschnitt 1: ...", "Abschnitt 2: ...", "..."],
+  "fact": "1-2 Sätze als Kernaussage/Fakt",
+  "key_points": ["Punkt 1", "Punkt 2", "..."],
+  "key_facts": ["Konkreter Fakt 1", "Konkreter Fakt 2", "..."],
+  "quotes": ["Kurzes Zitat falls sinnvoll", "..."],
+  "relevance": "Warum ist das für die Zielgruppe relevant? (1 Satz)",
+  "category": "Link",
+  "source": "{source_url}",
+  "source_title": "{source_title}",
+  "source_type": "{source_type}"
+}}
+
+Regeln:
+- Halte dich an das JSON-Format.
+- Keine erfundenen Fakten: nutze NUR den gegebenen Inhalt.
+- Wenn keine Zitate vorhanden sind, gib ein leeres Array zurück.
+- Summary und extended_summary müssen unterschiedliche Detailtiefe haben.
+"""
+
+        result = await self.call_openai(
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+            model="gpt-4o",
+            temperature=0.2,
+            response_format={"type": "json_object"}
+        )
+
+        try:
+            topic = json.loads(result)
+        except (ValueError, TypeError) as exc:
+            logger.warning(f"Failed to parse topic JSON: {exc}")
+            raise ValueError("Antwort konnte nicht verarbeitet werden.") from exc
+
+        # setdefault() below needs a mapping; valid JSON can also be a list or scalar
+        if not isinstance(topic, dict):
+            logger.warning(f"Topic JSON is not an object: {type(topic).__name__}")
+            raise ValueError("Antwort konnte nicht verarbeitet werden.")
+
+        # Ensure required fields exist
+        topic.setdefault("category", "Link")
+        topic.setdefault("source", source_url)
+        topic.setdefault("source_title", source_title)
+        topic.setdefault("source_type", source_type)
+        topic.setdefault("title", source_title or "Link-Thema")
+        topic.setdefault("summary", "")
+        topic.setdefault("extended_summary", "")
+        topic.setdefault("outline", [])
+        # "summary" may be present but null or not a string, so normalise before slicing
+        topic.setdefault("fact", str(topic.get("summary") or "")[:300])
+        topic.setdefault("key_points", [])
+        topic.setdefault("key_facts", [])
+        topic.setdefault("quotes", [])
+        topic.setdefault("relevance", "Relevantes Thema für die Zielgruppe")
+
+        return topic
diff --git a/src/agents/writer.py b/src/agents/writer.py
index c52d0bd..0444c89 100644
--- a/src/agents/writer.py
+++ b/src/agents/writer.py
@@ -351,6 +351,35 @@ class WriterAgent(BaseAgent):
         if key_facts and isinstance(key_facts, list) and len(key_facts) > 0:
             facts_section = "\n**KEY FACTS (nutze diese!):**\n" + "\n".join([f"- {f}" for f in key_facts]) + "\n"
 
+        summary_section = ""
+        if topic.get('summary'):
+            summary_section = f"\n**ZUSAMMENFASSUNG (vom Link):**\n{topic.get('summary')}\n"
+
+        extended_summary_section = ""
+        if topic.get('extended_summary'):
+            extended_summary_section = f"\n**DETAILLIERTE ZUSAMMENFASSUNG:**\n{topic.get('extended_summary')}\n"
+
+        outline_section = ""
+        outline = topic.get('outline', [])
+        if outline and isinstance(outline, list) and len(outline) > 0:
+            outline_section = "\n**INHALTS-GLIEDERUNG:**\n" + "\n".join([f"- {o}" for o in outline]) + "\n"
+
+        key_points_section = ""
+        key_points = topic.get('key_points', [])
+        if key_points and
isinstance(key_points, list) and len(key_points) > 0: + key_points_section = "\n**KERNPUNKTE (vom Link):**\n" + "\n".join([f"- {p}" for p in key_points]) + "\n" + + quotes_section = "" + quotes = topic.get('quotes', []) + if quotes and isinstance(quotes, list) and len(quotes) > 0: + quotes_section = "\n**ZITATE (optional verwenden):**\n" + "\n".join([f"- \"{q}\"" for q in quotes]) + "\n" + + source_section = "" + if topic.get('source_title') or topic.get('source'): + source_title = topic.get('source_title') or "" + source_url = topic.get('source') or "" + source_section = f"\n**QUELLE:** {source_title} {source_url}\n" + why_section = "" if topic.get('why_this_person'): why_section = f"\n**WARUM DU DARÜBER SCHREIBEN SOLLTEST:**\n{topic.get('why_this_person')}\n" @@ -391,7 +420,7 @@ Schreibe einen authentischen LinkedIn-Post, der: {angle_section}{hook_section} **KERN-FAKT / INHALT:** {topic.get('fact', topic.get('description', ''))} -{facts_section}{thoughts_section} +{summary_section}{extended_summary_section}{outline_section}{key_points_section}{facts_section}{quotes_section}{source_section}{thoughts_section} **WARUM RELEVANT:** {topic.get('relevance', 'Aktuelles Thema für die Zielgruppe')} {why_section} @@ -941,6 +970,35 @@ Gib NUR den überarbeiteten Post zurück - keine Kommentare.""" if key_facts and isinstance(key_facts, list) and len(key_facts) > 0: facts_section = "\n**KEY FACTS (nutze diese!):**\n" + "\n".join([f"- {f}" for f in key_facts]) + "\n" + summary_section = "" + if topic.get('summary'): + summary_section = f"\n**ZUSAMMENFASSUNG (vom Link):**\n{topic.get('summary')}\n" + + extended_summary_section = "" + if topic.get('extended_summary'): + extended_summary_section = f"\n**DETAILLIERTE ZUSAMMENFASSUNG:**\n{topic.get('extended_summary')}\n" + + outline_section = "" + outline = topic.get('outline', []) + if outline and isinstance(outline, list) and len(outline) > 0: + outline_section = "\n**INHALTS-GLIEDERUNG:**\n" + "\n".join([f"- {o}" for o in 
outline]) + "\n" + + key_points_section = "" + key_points = topic.get('key_points', []) + if key_points and isinstance(key_points, list) and len(key_points) > 0: + key_points_section = "\n**KERNPUNKTE (vom Link):**\n" + "\n".join([f"- {p}" for p in key_points]) + "\n" + + quotes_section = "" + quotes = topic.get('quotes', []) + if quotes and isinstance(quotes, list) and len(quotes) > 0: + quotes_section = "\n**ZITATE (optional verwenden):**\n" + "\n".join([f"- \"{q}\"" for q in quotes]) + "\n" + + source_section = "" + if topic.get('source_title') or topic.get('source'): + source_title = topic.get('source_title') or "" + source_url = topic.get('source') or "" + source_section = f"\n**QUELLE:** {source_title} {source_url}\n" + # User thoughts section thoughts_section = "" if user_thoughts: @@ -977,7 +1035,7 @@ Schreibe einen authentischen LinkedIn-Post, der: {angle_section}{hook_section} **KERN-FAKT / INHALT:** {topic.get('fact', topic.get('description', ''))} -{facts_section}{thoughts_section} +{summary_section}{extended_summary_section}{outline_section}{key_points_section}{facts_section}{quotes_section}{source_section}{thoughts_section} **WARUM RELEVANT:** {topic.get('relevance', 'Aktuelles Thema für die Zielgruppe')} @@ -1063,6 +1121,16 @@ Antworte im JSON-Format: if post_type.description: post_type_section += f"\n{post_type.description}" + content_block = topic.get('fact', topic.get('description', 'Keine Details verfügbar')) + if topic.get('summary'): + content_block += f"\n\nZUSAMMENFASSUNG:\n{topic.get('summary')}" + if topic.get('extended_summary'): + content_block += f"\n\nDETAILLIERTE ZUSAMMENFASSUNG:\n{topic.get('extended_summary')}" + if topic.get('outline') and isinstance(topic.get('outline'), list): + content_block += "\n\nGLIEDERUNG:\n" + "\n".join([f"- {o}" for o in topic.get('outline', [])]) + if topic.get('key_points') and isinstance(topic.get('key_points'), list): + content_block += "\n\nKERNPUNKTE:\n" + "\n".join([f"- {p}" for p in 
topic.get('key_points', [])])
+
         user_prompt = f"""Generiere 4 Hooks für dieses Thema:
 
 THEMA: {topic.get('title', 'Unbekanntes Thema')}
@@ -1070,7 +1138,7 @@ THEMA: {topic.get('title', 'Unbekanntes Thema')}
 KATEGORIE: {topic.get('category', 'Allgemein')}
 
 KERN-FAKT/INHALT:
-{topic.get('fact', topic.get('description', 'Keine Details verfügbar'))}
+{content_block}
 {thoughts_section}{post_type_section}
 
 Generiere jetzt die 4 verschiedenen Hooks im JSON-Format."""
diff --git a/src/config.py b/src/config.py
index 8c2c6c2..a3eac96 100644
--- a/src/config.py
+++ b/src/config.py
@@ -69,6 +69,10 @@ class Settings(BaseSettings):
     redis_url: str = "redis://redis:6379/0"
     scheduler_enabled: bool = False  # True only on dedicated scheduler container
 
+    # YouTube (optional)
+    youtube_cookies: str = ""  # Raw Cookie header value for transcript fetching
+    transcriptapi_key: str = ""  # TranscriptAPI.com key
+
     # Telegram Bot (experimental)
     telegram_enabled: bool = False
     telegram_bot_token: str = ""
diff --git a/src/services/file_extractor.py b/src/services/file_extractor.py
new file mode 100644
index 0000000..a51ceee
--- /dev/null
+++ b/src/services/file_extractor.py
@@ -0,0 +1,67 @@
+"""Extract text content from uploaded files using Docling."""
+import os
+import tempfile
+from typing import Optional
+
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+
+
+class FileExtractionError(RuntimeError):
+    """Raised when file extraction fails."""
+
+
+class FileExtractor:
+    """Extract text from files via Docling."""
+
+    def __init__(self) -> None:
+        """Create a Docling converter with OCR disabled for PDFs."""
+        pdf_options = PdfPipelineOptions(do_ocr=False)
+        self._converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
+            }
+        )
+
+    def extract_text(self, file_bytes: bytes, filename: str) -> str:
+        """Convert an uploaded file to Markdown text.
+
+        The bytes are written to a temporary file that keeps the extension of
+        *filename* and is removed again afterwards.
+
+        Raises:
+            FileExtractionError: If the input is empty, conversion fails or
+                the document yields no text.
+        """
+        if not file_bytes:
+            raise FileExtractionError("Leere Datei.")
+
+        suffix = ""
+        if filename and "." in filename:
+            suffix = os.path.splitext(filename)[1]
+
+        # Stays None until the temp file exists, so cleanup knows what to remove
+        tmp_path: Optional[str] = None
+        try:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                tmp_path = tmp.name
+                tmp.write(file_bytes)
+
+            result = self._converter.convert(tmp_path)
+            document = result.document
+            text = document.export_to_markdown().strip()
+            if not text:
+                raise FileExtractionError("Keine Inhalte im Dokument gefunden.")
+            return text
+        except FileExtractionError:
+            raise
+        except Exception as exc:
+            raise FileExtractionError(f"Datei konnte nicht verarbeitet werden: {exc}") from exc
+        finally:
+            if tmp_path is not None:
+                try:
+                    os.unlink(tmp_path)
+                except OSError:
+                    # Best-effort cleanup; a leftover temp file is not fatal
+                    pass
diff --git a/src/services/link_extractor.py b/src/services/link_extractor.py
new file mode 100644
index 0000000..dfd5a54
--- /dev/null
+++ b/src/services/link_extractor.py
@@ -0,0 +1,420 @@
+"""Extract content from links (web pages, PDFs, YouTube transcripts)."""
+import asyncio
+import ipaddress
+import re
+from io import BytesIO
+from typing import Dict, Optional
+from urllib.parse import parse_qs, urljoin, urlparse
+
+import httpx
+from loguru import logger
+
+from src.config import settings
+import trafilatura
+try:
+    from trafilatura.metadata import extract_metadata as trafilatura_extract_metadata
+except Exception:  # pragma: no cover - optional path
+    trafilatura_extract_metadata = None
+from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
+from pypdf import PdfReader
+
+
+class LinkExtractionError(RuntimeError):
+    """Raised when link extraction fails."""
+
+
+class LinkExtractor:
+    """Extract text content from supported link types."""
+
+    MAX_BYTES = 2_000_000  # ~2MB
+    TIMEOUT = 10.0
+    MAX_REDIRECTS = 3
+    # Hosts treated as YouTube: the domain itself or any of its subdomains
+    _YOUTUBE_DOMAINS = ("youtube.com", "youtu.be")
+
+    async def extract(self, url: str) -> Dict[str, Optional[str]]:
+        """Extract content and metadata from a link.
+
+        Returns:
+            Dict with the keys ``source_url``, ``source_type`` (``youtube``,
+            ``pdf`` or ``web``), ``title`` and ``text``.
+
+        Raises:
+            LinkExtractionError: For rejected URLs, missing transcripts and
+                documents without extractable text.
+        """
+        normalized = self._normalize_url(url)
+        self._validate_url(normalized)
+
+        if self._is_youtube_url(normalized):
+            video_id = self._extract_youtube_id(normalized)
+            if not video_id:
+                raise LinkExtractionError("YouTube-Link konnte nicht erkannt werden.")
+            transcript = ""
+            title = ""
+
+            if (settings.transcriptapi_key or "").strip():
+                try:
+                    transcript, title = await self._fetch_transcriptapi(normalized)
+                except Exception as exc:
+                    logger.warning(f"TranscriptAPI failed, falling back to direct fetch: {exc}")
+
+            if not transcript:
+                transcript = await self._fetch_youtube_transcript(video_id)
+            if not title:
+                title = await self._fetch_youtube_title(normalized)
+            return {
+                "source_url": normalized,
+                "source_type": "youtube",
+                "title": title or "YouTube Video",
+                "text": transcript,
+            }
+
+        content, content_type = await self._fetch_url(normalized)
+        if self._is_pdf(normalized, content_type):
+            text, title = self._extract_pdf(content)
+            return {
+                "source_url": normalized,
+                "source_type": "pdf",
+                "title": title or "PDF Dokument",
+                "text": text,
+            }
+
+        text, title = self._extract_html(content)
+        if not text:
+            raise LinkExtractionError("Konnte keinen lesbaren Text aus der Seite extrahieren.")
+
+        return {
+            "source_url": normalized,
+            "source_type": "web",
+            "title": title or "Webseite",
+            "text": text,
+        }
+
+    def _normalize_url(self, url: str) -> str:
+        """Strip the input and default to https:// when no http(s) scheme is given."""
+        url = url.strip()
+        if not url:
+            raise LinkExtractionError("Bitte einen Link eingeben.")
+        if not re.match(r"^https?://", url, re.IGNORECASE):
+            url = f"https://{url}"
+        return url
+
+    def _validate_url(self, url: str) -> None:
+        """Reject non-http(s) URLs and hosts that point at local targets.
+
+        Only IP literals are range-checked; hostnames are not resolved here.
+
+        Raises:
+            LinkExtractionError: If the URL must not be fetched.
+        """
+        parsed = urlparse(url)
+        if parsed.scheme not in ("http", "https"):
+            raise LinkExtractionError("Nur http/https Links sind erlaubt.")
+        if not parsed.hostname:
+            raise LinkExtractionError("Ungültiger Link.")
+
+        # A trailing dot ("localhost.") names the same host
+        hostname = parsed.hostname.lower().rstrip(".")
+        if hostname in {"localhost", "127.0.0.1", "0.0.0.0", "::1"}:
+            raise LinkExtractionError("Lokale Links sind nicht erlaubt.")
+        if hostname.endswith((".local", ".localhost")):
+            raise LinkExtractionError("Lokale Links sind nicht erlaubt.")
+
+        # Block private IPs if hostname is an IP literal
+        try:
+            ip = ipaddress.ip_address(hostname)
+        except ValueError:
+            return
+        # Unwrap ::ffff:a.b.c.d so the IPv4 range checks apply to it
+        mapped = getattr(ip, "ipv4_mapped", None)
+        if mapped is not None:
+            ip = mapped
+        if (
+            ip.is_private
+            or ip.is_loopback
+            or ip.is_link_local
+            or ip.is_reserved
+            or ip.is_multicast
+            or ip.is_unspecified
+        ):
+            raise LinkExtractionError("Lokale Links sind nicht erlaubt.")
+
+    async def _fetch_url(self, url: str) -> tuple[bytes, str]:
+        """Download *url* and return ``(body, lowercased content type)``.
+
+        Redirects are followed by hand so every hop passes ``_validate_url``,
+        and the body is streamed so the download stops at ``MAX_BYTES``.
+
+        Raises:
+            LinkExtractionError: On a rejected or excessive redirect, or when
+                the body is larger than ``MAX_BYTES``.
+            httpx.HTTPStatusError: If the final response has an error status.
+        """
+        headers = {
+            "User-Agent": "Mozilla/5.0 (compatible; LinkedInWorkflowBot/1.0)"
+        }
+        too_large = "Die Seite ist zu groß, um verarbeitet zu werden."
+        async with httpx.AsyncClient(
+            follow_redirects=False,
+            timeout=self.TIMEOUT,
+            headers=headers,
+            limits=httpx.Limits(max_keepalive_connections=5, max_connections=10),
+        ) as client:
+            for _ in range(self.MAX_REDIRECTS + 1):
+                async with client.stream("GET", url) as response:
+                    location = response.headers.get("location")
+                    if response.is_redirect and location:
+                        # Re-check the target: a public URL may redirect inward
+                        url = urljoin(str(response.url), location)
+                        self._validate_url(url)
+                        continue
+                    response.raise_for_status()
+                    declared = response.headers.get("content-length", "")
+                    if declared.isdigit() and int(declared) > self.MAX_BYTES:
+                        raise LinkExtractionError(too_large)
+                    body = bytearray()
+                    async for chunk in response.aiter_bytes():
+                        body.extend(chunk)
+                        if len(body) > self.MAX_BYTES:
+                            raise LinkExtractionError(too_large)
+                    content_type = response.headers.get("content-type", "").lower()
+                    return bytes(body), content_type
+        raise LinkExtractionError("Zu viele Weiterleitungen.")
+
+    def _is_pdf(self, url: str, content_type: str) -> bool:
+        """Return True for a PDF content type or a ``.pdf`` URL/path suffix."""
+        if "application/pdf" in content_type:
+            return True
+        # Check the path too so "file.pdf?download=1" is still recognised
+        lowered = url.lower()
+        return lowered.endswith(".pdf") or urlparse(lowered).path.endswith(".pdf")
+
+    def _extract_pdf(self, content: bytes) -> tuple[str, Optional[str]]:
+        """Return ``(text, title)`` from PDF bytes; the title may be None.
+
+        Raises:
+            LinkExtractionError: If parsing fails or no text is found.
+        """
+        try:
+            reader = PdfReader(BytesIO(content))
+            text_parts = []
+            for page in reader.pages:
+                page_text = page.extract_text() or ""
+                if page_text:
+                    text_parts.append(page_text)
+            text = "\n".join(text_parts).strip()
+            title = None
+            try:
+                title = reader.metadata.title if reader.metadata else None
+            except Exception:
+                title = None
+            if not text:
+                raise LinkExtractionError("Konnte keinen Text aus dem PDF extrahieren.")
+            return text, title
+        except LinkExtractionError:
+            raise
+        except Exception as exc:
+            raise LinkExtractionError(f"PDF-Extraktion fehlgeschlagen: {exc}") from exc
+
+    def _extract_html(self, content: bytes) -> tuple[str, Optional[str]]:
+        """Return ``(main text, title)`` of an HTML page via trafilatura."""
+        html = content.decode("utf-8", errors="ignore")
+        text = trafilatura.extract(
+            html,
+            include_comments=False,
+            include_tables=False,
+            include_formatting=False,
+            output_format="txt"
+        )
+        title = None
+        try:
+            if trafilatura_extract_metadata:
+                metadata = trafilatura_extract_metadata(html)
+                if metadata and metadata.title:
+                    title = metadata.title
+        except Exception:
+            title = None
+        return (text or "").strip(), title
+
+    @staticmethod
+    def _host_in_domain(host: str, domain: str) -> bool:
+        """Return True if *host* is *domain* itself or one of its subdomains."""
+        return host == domain or host.endswith(f".{domain}")
+
+    def _is_youtube_url(self, url: str) -> bool:
+        """Return True if the URL host belongs to youtube.com or youtu.be."""
+        host = (urlparse(url).hostname or "").lower()
+        return any(self._host_in_domain(host, d) for d in self._YOUTUBE_DOMAINS)
+
+    def _extract_youtube_id(self, url: str) -> Optional[str]:
+        """Return the video id of a YouTube URL, or None if there is none.
+
+        Handles ``youtu.be/<id>``, ``watch?v=<id>`` and the path forms
+        ``/shorts/<id>``, ``/embed/<id>`` and ``/live/<id>``.
+        """
+        parsed = urlparse(url)
+        host = (parsed.hostname or "").lower()
+        segments = [part for part in parsed.path.split("/") if part]
+        if self._host_in_domain(host, "youtu.be"):
+            return segments[0] if segments else None
+        if self._host_in_domain(host, "youtube.com"):
+            qs = parse_qs(parsed.query)
+            if "v" in qs and qs["v"]:
+                return qs["v"][0]
+            if len(segments) >= 2 and segments[0] in {"shorts", "embed", "live"}:
+                return segments[1]
+        return None
+
+    async def _fetch_youtube_transcript(self, video_id: str) -> str:
+        """Fetch a transcript for *video_id* and return it as one string.
+
+        German and English are tried; manually created captions are
+        preferred over generated ones.
+
+        Raises:
+            LinkExtractionError: If no transcript is available or the fetch
+                fails.
+        """
+        # Bound before the try block because the fallback handler reads it.
+        # NOTE(review): youtube-transcript-api 0.6.x documents `cookies` as a
+        # path to a cookies.txt file, not a raw header value - confirm.
+        cookies = (settings.youtube_cookies or "").strip()
+        languages = ["de", "de-DE", "de-AT", "de-CH", "en", "en-US", "en-GB"]
+        try:
+            # Prefer full transcript list to handle generated vs. manually created captions
+            transcript_list = await asyncio.to_thread(
+                YouTubeTranscriptApi.list_transcripts,
+                video_id,
+                cookies=cookies or None
+            )
+
+            transcript = None
+            try:
+                transcript = transcript_list.find_manually_created_transcript(languages)
+            except Exception:
+                pass
+
+            if transcript is None:
+                try:
+                    transcript = transcript_list.find_generated_transcript(languages)
+                except Exception:
+                    pass
+
+            if transcript is None:
+                try:
+                    transcript = transcript_list.find_transcript(languages)
+                except Exception:
+                    transcript = None
+
+            if transcript is None:
+                raise LinkExtractionError("Kein passendes Transkript gefunden.")
+
+            data = await asyncio.to_thread(transcript.fetch)
+            text = " ".join([item.get("text", "") for item in data]).strip()
+            if not text:
+                raise LinkExtractionError("Kein Transkript verfügbar.")
+            return text
+        except LinkExtractionError:
+            # Already user-facing; do not bury it under the generic message
+            raise
+        except (TranscriptsDisabled, NoTranscriptFound):
+            raise LinkExtractionError("Für dieses Video ist kein Transkript verfügbar.")
+        except Exception as exc:
+            logger.exception(f"YouTube transcript fetch failed: {exc}")
+            debug_dump = await self._debug_youtube_dump(video_id, cookies or None)
+            logger.warning(f"YouTube debug dump for {video_id}:\n{debug_dump}")
+            raise LinkExtractionError(
+                "YouTube-Transkript konnte nicht geladen werden. "
+                "YouTube blockiert den Abruf manchmal. "
+                "Wenn du Zugriff hast, setze die Umgebungsvariable "
+                "`YOUTUBE_COOKIES` mit gültigen Cookies."
+            ) from exc
+
+    async def _debug_youtube_dump(self, video_id: str, cookies: Optional[str]) -> str:
+        """Fetch several YouTube endpoints and return status/body snippets for logging."""
+        urls = [
+            f"https://www.youtube.com/api/timedtext?type=list&v={video_id}",
+            f"https://www.youtube.com/api/timedtext?lang=de&v={video_id}",
+            f"https://www.youtube.com/api/timedtext?lang=de&v={video_id}&fmt=json3",
+            f"https://www.youtube.com/api/timedtext?lang=en&v={video_id}",
+            f"https://www.youtube.com/watch?v={video_id}",
+        ]
+        headers = {
+            "User-Agent": "Mozilla/5.0 (compatible; LinkedInWorkflowBot/1.0)"
+        }
+        if cookies:
+            headers["Cookie"] = cookies
+
+        lines = [f"cookies_used={bool(cookies)}"]
+        async with httpx.AsyncClient(timeout=self.TIMEOUT, headers=headers) as client:
+            for url in urls:
+                try:
+                    resp = await client.get(url)
+                    body = resp.text
+                    snippet = body[:2000].replace("\n", "\\n").replace("\r", "")
+                    lines.append(f"URL: {url}")
+                    lines.append(f"STATUS: {resp.status_code}")
+                    lines.append(f"BODY_SNIPPET: {snippet}")
+                except Exception as exc:
+                    lines.append(f"URL: {url}")
+                    lines.append(f"ERROR: {repr(exc)}")
+
+        return "\n".join(lines)
+
+    async def _fetch_transcriptapi(self, video_url: str) -> tuple[str, str]:
+        """Fetch ``(transcript text, title)`` for a video from TranscriptAPI.
+
+        Raises:
+            LinkExtractionError: If the key is missing, the API does not
+                answer with 200 or no transcript text is returned.
+        """
+        api_key = (settings.transcriptapi_key or "").strip()
+        if not api_key:
+            raise LinkExtractionError("TranscriptAPI Key fehlt.")
+
+        endpoint = "https://transcriptapi.com/api/v2/youtube/transcript"
+        params = {
+            "video_url": video_url,
+            "format": "json",
+            "include_timestamp": "false",
+            "send_metadata": "true",
+        }
+        headers = {"Authorization": f"Bearer {api_key}"}
+
+        async with httpx.AsyncClient(timeout=self.TIMEOUT) as client:
+            resp = await client.get(endpoint, params=params, headers=headers)
+            if resp.status_code != 200:
+                raise LinkExtractionError(f"TranscriptAPI Fehler: {resp.status_code}")
+            data = resp.json()
+
+        transcript = data.get("transcript")
+        text = ""
+        if isinstance(transcript, list):
+            text = " ".join([item.get("text", "") for item in transcript]).strip()
+        elif isinstance(transcript, str):
+            text = transcript.strip()
+
+        title = ""
+        meta = data.get("metadata") or {}
+        if isinstance(meta, dict):
+            title = meta.get("title", "") or ""
+
+        if not text:
+            raise LinkExtractionError("TranscriptAPI lieferte kein Transkript.")
+
+        return text, title
+
+    async def _fetch_youtube_title(self, url: str) -> Optional[str]:
+        """Return the video title from YouTube's oEmbed endpoint, or None."""
+        # Pass the video URL via params so its own "?"/"&" get percent-encoded
+        params = {"format": "json", "url": url}
+        try:
+            async with httpx.AsyncClient(timeout=self.TIMEOUT) as client:
+                response = await client.get("https://www.youtube.com/oembed", params=params)
+                if response.status_code != 200:
+                    return None
+                data = response.json()
+                return data.get("title")
+        except Exception:
+            return None
diff --git a/src/web/templates/user/create_post_file.html b/src/web/templates/user/create_post_file.html
new file mode 100644
index 0000000..b74b7e5
--- /dev/null
+++ b/src/web/templates/user/create_post_file.html
@@ -0,0 +1,1221 @@
+{% extends "base.html" %}
+{% block title %}Post aus Datei erstellen - LinkedIn Posts{% endblock %}
+
+{% block content %}
+
+
Lade eine Datei hoch und generiere daraus einen LinkedIn Post
+Lade eine Datei hoch (PDF, DOCX, PPTX, XLSX, TXT, MD, RTF). Max. 10 MB.
+ +Analysiere einen Link und generiere daraus einen LinkedIn Post
+Gib einen Link zu einer Webseite, einem PDF oder einem YouTube-Video ein.
+ +