added file and link create
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
# Core Dependencies
|
||||
python-dotenv==1.0.0
|
||||
pydantic==2.5.0
|
||||
pydantic-settings==2.1.0
|
||||
pydantic==2.7.4
|
||||
pydantic-settings==2.3.0
|
||||
|
||||
# AI & APIs
|
||||
openai==1.54.0
|
||||
@@ -41,5 +41,11 @@ uvicorn==0.32.0
|
||||
jinja2==3.1.4
|
||||
python-multipart==0.0.9
|
||||
|
||||
# Link Extraction
|
||||
trafilatura==1.7.0
|
||||
youtube-transcript-api==0.6.2
|
||||
pypdf==4.2.0
|
||||
docling==2.74.0
|
||||
|
||||
# Teams Bot JWT validation
|
||||
PyJWT>=2.8.0
|
||||
|
||||
95
src/agents/link_topic_builder.py
Normal file
95
src/agents/link_topic_builder.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""Agent to build a structured topic from extracted link content."""
|
||||
import json
|
||||
from typing import Any, Dict
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from src.agents.base import BaseAgent
|
||||
|
||||
|
||||
class LinkTopicBuilderAgent(BaseAgent):
    """Build a structured topic dictionary from extracted source content.

    The agent sends the (truncated) source text to the model, asks for a JSON
    object describing a LinkedIn topic, and back-fills every field the model
    left out or returned as ``null``.
    """

    # Sources whose stripped text is shorter than this are rejected.
    MIN_CONTENT_CHARS = 200
    # Content is cut to this many characters to keep the prompt reasonable.
    MAX_CONTENT_CHARS = 12000
    # Maximum length of the "fact" fallback that is derived from the summary.
    FACT_FALLBACK_CHARS = 300

    def __init__(self) -> None:
        super().__init__("link_topic_builder")

    async def process(self, source: Dict[str, Any]) -> Dict[str, Any]:
        """Turn an extracted source into a topic dictionary.

        Args:
            source: Mapping with the keys ``text``, ``source_url``,
                ``source_type`` and ``title``; missing keys fall back to
                defaults.

        Returns:
            The topic parsed from the model reply, guaranteed to contain the
            keys ``category``, ``source``, ``source_title``, ``source_type``,
            ``title``, ``summary``, ``extended_summary``, ``outline``,
            ``fact``, ``key_points``, ``key_facts``, ``quotes`` and
            ``relevance`` with non-``None`` values.

        Raises:
            ValueError: If the source text is too short, or the model reply
                is not a JSON object.
        """
        content = source.get("text", "")
        source_url = source.get("source_url", "")
        source_type = source.get("source_type", "web")
        source_title = source.get("title", "")

        if not content or len(content.strip()) < self.MIN_CONTENT_CHARS:
            raise ValueError("Zu wenig Inhalt, um ein sinnvolles Topic zu erstellen.")

        # Keep content length reasonable for the model
        content = content[: self.MAX_CONTENT_CHARS]

        system_prompt = """Du bist ein Assistent, der aus einem Link-Inhalt ein strukturiertes Topic für einen LinkedIn-Post erstellt.

Gib NUR valides JSON zurück (keine Erklärungen)."""

        user_prompt = f"""Quelle:
- Typ: {source_type}
- Titel: {source_title}
- URL: {source_url}

Inhalt:
\"\"\"{content}\"\"\"

Erstelle ein Topic im folgenden JSON-Format:
{{
"title": "Kurzer, präziser Titel (max 80 Zeichen)",
"summary": "Detaillierte, vollumfängliche Zusammenfassung (8-12 Sätze, alle Hauptpunkte abdecken)",
"extended_summary": "Längere Zusammenfassung (150-250 Wörter) mit Kontext, Kernaussagen, Belegen und Implikationen",
"outline": ["Abschnitt 1: ...", "Abschnitt 2: ...", "..."],
"fact": "1-2 Sätze als Kernaussage/Fakt",
"key_points": ["Punkt 1", "Punkt 2", "..."],
"key_facts": ["Konkreter Fakt 1", "Konkreter Fakt 2", "..."],
"quotes": ["Kurzes Zitat falls sinnvoll", "..."],
"relevance": "Warum ist das für die Zielgruppe relevant? (1 Satz)",
"category": "Link",
"source": "{source_url}",
"source_title": "{source_title}",
"source_type": "{source_type}"
}}

Regeln:
- Halte dich an das JSON-Format.
- Keine erfundenen Fakten: nutze NUR den gegebenen Inhalt.
- Wenn keine Zitate vorhanden sind, gib ein leeres Array zurück.
- Summary und extended_summary müssen unterschiedliche Detailtiefe haben.
"""

        result = await self.call_openai(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            model="gpt-4o",
            temperature=0.2,
            response_format={"type": "json_object"},
        )

        try:
            topic = json.loads(result)
        except (TypeError, ValueError) as exc:
            # json.JSONDecodeError is a ValueError; TypeError covers a reply
            # that is not str/bytes (e.g. None).
            logger.warning(f"Failed to parse topic JSON: {exc}")
            raise ValueError("Antwort konnte nicht verarbeitet werden.") from exc

        if not isinstance(topic, dict):
            # Valid JSON that is not an object (list, string, number, null)
            # cannot be used as a topic.
            logger.warning(
                f"Failed to parse topic JSON: expected object, got {type(topic).__name__}"
            )
            raise ValueError("Antwort konnte nicht verarbeitet werden.")

        # Ensure required fields exist. A key that is present but null is
        # treated like a missing key so consumers never see None.
        fallbacks: Dict[str, Any] = {
            "category": "Link",
            "source": source_url,
            "source_title": source_title,
            "source_type": source_type,
            "title": source_title or "Link-Thema",
            "summary": "",
            "extended_summary": "",
            "outline": [],
            "key_points": [],
            "key_facts": [],
            "quotes": [],
            "relevance": "Relevantes Thema für die Zielgruppe",
        }
        for key, fallback in fallbacks.items():
            if topic.get(key) is None:
                topic[key] = fallback

        if topic.get("fact") is None:
            # Derive the fact from the summary only when the summary really is
            # text; slicing any other type would raise or yield nonsense.
            summary = topic["summary"]
            topic["fact"] = (
                summary[: self.FACT_FALLBACK_CHARS] if isinstance(summary, str) else ""
            )

        return topic
|
||||
@@ -351,6 +351,35 @@ class WriterAgent(BaseAgent):
|
||||
if key_facts and isinstance(key_facts, list) and len(key_facts) > 0:
|
||||
facts_section = "\n**KEY FACTS (nutze diese!):**\n" + "\n".join([f"- {f}" for f in key_facts]) + "\n"
|
||||
|
||||
summary_section = ""
|
||||
if topic.get('summary'):
|
||||
summary_section = f"\n**ZUSAMMENFASSUNG (vom Link):**\n{topic.get('summary')}\n"
|
||||
|
||||
extended_summary_section = ""
|
||||
if topic.get('extended_summary'):
|
||||
extended_summary_section = f"\n**DETAILLIERTE ZUSAMMENFASSUNG:**\n{topic.get('extended_summary')}\n"
|
||||
|
||||
outline_section = ""
|
||||
outline = topic.get('outline', [])
|
||||
if outline and isinstance(outline, list) and len(outline) > 0:
|
||||
outline_section = "\n**INHALTS-GLIEDERUNG:**\n" + "\n".join([f"- {o}" for o in outline]) + "\n"
|
||||
|
||||
key_points_section = ""
|
||||
key_points = topic.get('key_points', [])
|
||||
if key_points and isinstance(key_points, list) and len(key_points) > 0:
|
||||
key_points_section = "\n**KERNPUNKTE (vom Link):**\n" + "\n".join([f"- {p}" for p in key_points]) + "\n"
|
||||
|
||||
quotes_section = ""
|
||||
quotes = topic.get('quotes', [])
|
||||
if quotes and isinstance(quotes, list) and len(quotes) > 0:
|
||||
quotes_section = "\n**ZITATE (optional verwenden):**\n" + "\n".join([f"- \"{q}\"" for q in quotes]) + "\n"
|
||||
|
||||
source_section = ""
|
||||
if topic.get('source_title') or topic.get('source'):
|
||||
source_title = topic.get('source_title') or ""
|
||||
source_url = topic.get('source') or ""
|
||||
source_section = f"\n**QUELLE:** {source_title} {source_url}\n"
|
||||
|
||||
why_section = ""
|
||||
if topic.get('why_this_person'):
|
||||
why_section = f"\n**WARUM DU DARÜBER SCHREIBEN SOLLTEST:**\n{topic.get('why_this_person')}\n"
|
||||
@@ -391,7 +420,7 @@ Schreibe einen authentischen LinkedIn-Post, der:
|
||||
{angle_section}{hook_section}
|
||||
**KERN-FAKT / INHALT:**
|
||||
{topic.get('fact', topic.get('description', ''))}
|
||||
{facts_section}{thoughts_section}
|
||||
{summary_section}{extended_summary_section}{outline_section}{key_points_section}{facts_section}{quotes_section}{source_section}{thoughts_section}
|
||||
**WARUM RELEVANT:**
|
||||
{topic.get('relevance', 'Aktuelles Thema für die Zielgruppe')}
|
||||
{why_section}
|
||||
@@ -941,6 +970,35 @@ Gib NUR den überarbeiteten Post zurück - keine Kommentare."""
|
||||
if key_facts and isinstance(key_facts, list) and len(key_facts) > 0:
|
||||
facts_section = "\n**KEY FACTS (nutze diese!):**\n" + "\n".join([f"- {f}" for f in key_facts]) + "\n"
|
||||
|
||||
summary_section = ""
|
||||
if topic.get('summary'):
|
||||
summary_section = f"\n**ZUSAMMENFASSUNG (vom Link):**\n{topic.get('summary')}\n"
|
||||
|
||||
extended_summary_section = ""
|
||||
if topic.get('extended_summary'):
|
||||
extended_summary_section = f"\n**DETAILLIERTE ZUSAMMENFASSUNG:**\n{topic.get('extended_summary')}\n"
|
||||
|
||||
outline_section = ""
|
||||
outline = topic.get('outline', [])
|
||||
if outline and isinstance(outline, list) and len(outline) > 0:
|
||||
outline_section = "\n**INHALTS-GLIEDERUNG:**\n" + "\n".join([f"- {o}" for o in outline]) + "\n"
|
||||
|
||||
key_points_section = ""
|
||||
key_points = topic.get('key_points', [])
|
||||
if key_points and isinstance(key_points, list) and len(key_points) > 0:
|
||||
key_points_section = "\n**KERNPUNKTE (vom Link):**\n" + "\n".join([f"- {p}" for p in key_points]) + "\n"
|
||||
|
||||
quotes_section = ""
|
||||
quotes = topic.get('quotes', [])
|
||||
if quotes and isinstance(quotes, list) and len(quotes) > 0:
|
||||
quotes_section = "\n**ZITATE (optional verwenden):**\n" + "\n".join([f"- \"{q}\"" for q in quotes]) + "\n"
|
||||
|
||||
source_section = ""
|
||||
if topic.get('source_title') or topic.get('source'):
|
||||
source_title = topic.get('source_title') or ""
|
||||
source_url = topic.get('source') or ""
|
||||
source_section = f"\n**QUELLE:** {source_title} {source_url}\n"
|
||||
|
||||
# User thoughts section
|
||||
thoughts_section = ""
|
||||
if user_thoughts:
|
||||
@@ -977,7 +1035,7 @@ Schreibe einen authentischen LinkedIn-Post, der:
|
||||
{angle_section}{hook_section}
|
||||
**KERN-FAKT / INHALT:**
|
||||
{topic.get('fact', topic.get('description', ''))}
|
||||
{facts_section}{thoughts_section}
|
||||
{summary_section}{extended_summary_section}{outline_section}{key_points_section}{facts_section}{quotes_section}{source_section}{thoughts_section}
|
||||
**WARUM RELEVANT:**
|
||||
{topic.get('relevance', 'Aktuelles Thema für die Zielgruppe')}
|
||||
|
||||
@@ -1063,6 +1121,16 @@ Antworte im JSON-Format:
|
||||
if post_type.description:
|
||||
post_type_section += f"\n{post_type.description}"
|
||||
|
||||
content_block = topic.get('fact', topic.get('description', 'Keine Details verfügbar'))
|
||||
if topic.get('summary'):
|
||||
content_block += f"\n\nZUSAMMENFASSUNG:\n{topic.get('summary')}"
|
||||
if topic.get('extended_summary'):
|
||||
content_block += f"\n\nDETAILLIERTE ZUSAMMENFASSUNG:\n{topic.get('extended_summary')}"
|
||||
if topic.get('outline') and isinstance(topic.get('outline'), list):
|
||||
content_block += "\n\nGLIEDERUNG:\n" + "\n".join([f"- {o}" for o in topic.get('outline', [])])
|
||||
if topic.get('key_points') and isinstance(topic.get('key_points'), list):
|
||||
content_block += "\n\nKERNPUNKTE:\n" + "\n".join([f"- {p}" for p in topic.get('key_points', [])])
|
||||
|
||||
user_prompt = f"""Generiere 4 Hooks für dieses Thema:
|
||||
|
||||
THEMA: {topic.get('title', 'Unbekanntes Thema')}
|
||||
@@ -1070,7 +1138,7 @@ THEMA: {topic.get('title', 'Unbekanntes Thema')}
|
||||
KATEGORIE: {topic.get('category', 'Allgemein')}
|
||||
|
||||
KERN-FAKT/INHALT:
|
||||
{topic.get('fact', topic.get('description', 'Keine Details verfügbar'))}
|
||||
{content_block}
|
||||
{thoughts_section}{post_type_section}
|
||||
|
||||
Generiere jetzt die 4 verschiedenen Hooks im JSON-Format."""
|
||||
|
||||
@@ -69,6 +69,10 @@ class Settings(BaseSettings):
|
||||
redis_url: str = "redis://redis:6379/0"
|
||||
scheduler_enabled: bool = False # True only on dedicated scheduler container
|
||||
|
||||
# YouTube (optional)
|
||||
youtube_cookies: str = "" # Raw Cookie header value for transcript fetching
|
||||
transcriptapi_key: str = "" # TranscriptAPI.com key
|
||||
|
||||
# Telegram Bot (experimental)
|
||||
telegram_enabled: bool = False
|
||||
telegram_bot_token: str = ""
|
||||
|
||||
54
src/services/file_extractor.py
Normal file
54
src/services/file_extractor.py
Normal file
@@ -0,0 +1,54 @@
|
||||
"""Extract text content from uploaded files using Docling."""
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Optional
|
||||
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
|
||||
|
||||
class FileExtractionError(RuntimeError):
    """Raised when file extraction fails."""


class FileExtractor:
    """Extract text from files via Docling."""

    def __init__(self) -> None:
        # OCR is switched off for PDFs; only embedded text is extracted.
        pdf_options = PdfPipelineOptions(do_ocr=False)
        self._converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
            }
        )

    def extract_text(self, file_bytes: bytes, filename: str) -> str:
        """Convert an uploaded file to Markdown text.

        The bytes are written to a temporary file (keeping the original
        extension as suffix), converted with the Docling converter, and the
        temporary file is removed afterwards in every case.

        Args:
            file_bytes: Raw file content.
            filename: Original file name; only its extension is used.

        Returns:
            The document exported as Markdown, stripped of surrounding
            whitespace.

        Raises:
            FileExtractionError: If the file is empty, the conversion fails,
                or the converted document contains no text.
        """
        if not file_bytes:
            raise FileExtractionError("Leere Datei.")

        suffix = os.path.splitext(filename)[1] if filename else ""

        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                # Record the path before writing so the file is still cleaned
                # up when the write itself fails (delete=False keeps it).
                tmp_path = tmp.name
                tmp.write(file_bytes)

            result = self._converter.convert(tmp_path)
            text = result.document.export_to_markdown().strip()
            if not text:
                raise FileExtractionError("Keine Inhalte im Dokument gefunden.")
            return text
        except FileExtractionError:
            raise
        except Exception as exc:
            raise FileExtractionError(
                f"Datei konnte nicht verarbeitet werden: {exc}"
            ) from exc
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    # Best-effort cleanup: a leftover temp file must not mask
                    # the real result or error.
                    pass
|
||||
324
src/services/link_extractor.py
Normal file
324
src/services/link_extractor.py
Normal file
@@ -0,0 +1,324 @@
|
||||
"""Extract content from links (web pages, PDFs, YouTube transcripts)."""
|
||||
import asyncio
|
||||
import ipaddress
|
||||
import re
|
||||
from io import BytesIO
|
||||
from typing import Dict, Optional
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
import httpx
|
||||
from loguru import logger
|
||||
|
||||
from src.config import settings
|
||||
import trafilatura
|
||||
try:
|
||||
from trafilatura.metadata import extract_metadata as trafilatura_extract_metadata
|
||||
except Exception: # pragma: no cover - optional path
|
||||
trafilatura_extract_metadata = None
|
||||
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
class LinkExtractionError(RuntimeError):
    """Raised when link extraction fails."""


class LinkExtractor:
    """Extract text content from supported link types."""

    MAX_BYTES = 2_000_000  # ~2MB
    TIMEOUT = 10.0
    MAX_REDIRECTS = 3

    # Domains (and their subdomains) that are handled as YouTube links.
    _YOUTUBE_DOMAINS = ("youtube.com",)
    _YOUTUBE_SHORT_DOMAINS = ("youtu.be",)
    # Path prefixes of the form /<prefix>/<video id> on youtube.com.
    _YOUTUBE_ID_PATH_PREFIXES = ("shorts", "embed", "live")

    async def extract(self, url: str) -> Dict[str, Optional[str]]:
        """Extract content and metadata from a link.

        Returns a dict with the keys ``source_url``, ``source_type``
        (``"youtube"``, ``"pdf"`` or ``"web"``), ``title`` and ``text``.

        Raises:
            LinkExtractionError: If the URL is invalid or not allowed, or no
                text could be extracted.
        """
        normalized = self._normalize_url(url)
        self._validate_url(normalized)

        if self._is_youtube_url(normalized):
            video_id = self._extract_youtube_id(normalized)
            if not video_id:
                raise LinkExtractionError("YouTube-Link konnte nicht erkannt werden.")
            transcript = ""
            title = ""

            if (settings.transcriptapi_key or "").strip():
                try:
                    transcript, title = await self._fetch_transcriptapi(normalized)
                except Exception as exc:
                    # Deliberate best-effort: any failure of the hosted API
                    # falls through to the direct fetch below.
                    logger.warning(f"TranscriptAPI failed, falling back to direct fetch: {exc}")

            if not transcript:
                transcript = await self._fetch_youtube_transcript(video_id)
            if not title:
                title = await self._fetch_youtube_title(normalized)
            return {
                "source_url": normalized,
                "source_type": "youtube",
                "title": title or "YouTube Video",
                "text": transcript,
            }

        content, content_type = await self._fetch_url(normalized)
        if self._is_pdf(normalized, content_type):
            text, title = self._extract_pdf(content)
            return {
                "source_url": normalized,
                "source_type": "pdf",
                "title": title or "PDF Dokument",
                "text": text,
            }

        text, title = self._extract_html(content)
        if not text:
            raise LinkExtractionError("Konnte keinen lesbaren Text aus der Seite extrahieren.")

        return {
            "source_url": normalized,
            "source_type": "web",
            "title": title or "Webseite",
            "text": text,
        }

    def _normalize_url(self, url: str) -> str:
        """Strip the URL and prepend ``https://`` when no http(s) scheme is given."""
        url = (url or "").strip()
        if not url:
            raise LinkExtractionError("Bitte einen Link eingeben.")
        if not re.match(r"^https?://", url, re.IGNORECASE):
            url = f"https://{url}"
        return url

    def _validate_url(self, url: str) -> None:
        """Reject non-http(s) URLs and URLs pointing at local addresses.

        NOTE(review): only IP literals are checked against private ranges; a
        public hostname that resolves to a private address is not detected
        here.

        Raises:
            LinkExtractionError: If the URL is not allowed.
        """
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            raise LinkExtractionError("Nur http/https Links sind erlaubt.")
        if not parsed.hostname:
            raise LinkExtractionError("Ungültiger Link.")

        # A trailing dot denotes the same host ("localhost." == "localhost").
        hostname = parsed.hostname.lower().rstrip(".")
        if not hostname:
            raise LinkExtractionError("Ungültiger Link.")
        if hostname in {"localhost", "127.0.0.1", "0.0.0.0", "::1"}:
            raise LinkExtractionError("Lokale Links sind nicht erlaubt.")
        if hostname.endswith((".local", ".localhost")):
            raise LinkExtractionError("Lokale Links sind nicht erlaubt.")

        # Block private IPs if hostname is an IP literal
        try:
            ip = ipaddress.ip_address(hostname)
        except ValueError:
            return  # Not an IP literal; nothing more to check here.

        # ::ffff:a.b.c.d must be judged by the embedded IPv4 address.
        mapped = getattr(ip, "ipv4_mapped", None)
        if mapped is not None:
            ip = mapped
        if (
            ip.is_private
            or ip.is_loopback
            or ip.is_link_local
            or ip.is_reserved
            or ip.is_unspecified
            or ip.is_multicast
        ):
            raise LinkExtractionError("Lokale Links sind nicht erlaubt.")

    async def _fetch_url(self, url: str) -> tuple[bytes, str]:
        """Download ``url`` and return ``(body, lower-cased content type)``.

        Redirects are followed manually (at most ``MAX_REDIRECTS``) so every
        redirect target passes ``_validate_url``; the body is streamed and
        aborted as soon as it exceeds ``MAX_BYTES``.

        Raises:
            LinkExtractionError: On transport/HTTP errors, disallowed redirect
                targets, too many redirects, or an oversized body.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; LinkedInWorkflowBot/1.0)"
        }
        try:
            async with httpx.AsyncClient(
                follow_redirects=False,
                timeout=self.TIMEOUT,
                headers=headers,
                limits=httpx.Limits(max_keepalive_connections=5, max_connections=10),
            ) as client:
                current_url = url
                # One initial request plus up to MAX_REDIRECTS redirects.
                for _ in range(self.MAX_REDIRECTS + 1):
                    async with client.stream("GET", current_url) as response:
                        if response.has_redirect_location:
                            target = str(response.url.join(response.headers["location"]))
                            self._validate_url(target)
                            current_url = target
                            continue

                        response.raise_for_status()

                        chunks = []
                        received = 0
                        async for chunk in response.aiter_bytes():
                            received += len(chunk)
                            if received > self.MAX_BYTES:
                                raise LinkExtractionError(
                                    "Die Seite ist zu groß, um verarbeitet zu werden."
                                )
                            chunks.append(chunk)

                        content_type = response.headers.get("content-type", "").lower()
                        return b"".join(chunks), content_type

                raise LinkExtractionError("Zu viele Weiterleitungen.")
        except httpx.HTTPStatusError as exc:
            raise LinkExtractionError(
                f"Link konnte nicht abgerufen werden (HTTP {exc.response.status_code})."
            ) from exc
        except (httpx.HTTPError, httpx.InvalidURL) as exc:
            raise LinkExtractionError("Link konnte nicht abgerufen werden.") from exc

    def _is_pdf(self, url: str, content_type: str) -> bool:
        """Return True if the content type or the URL path indicates a PDF."""
        if "application/pdf" in content_type:
            return True
        # Look at the path only, so query strings/fragments do not hide ".pdf".
        return urlparse(url).path.lower().endswith(".pdf")

    def _extract_pdf(self, content: bytes) -> tuple[str, Optional[str]]:
        """Return ``(text, title)`` of a PDF; the title may be None.

        Raises:
            LinkExtractionError: If parsing fails or no text is found.
        """
        try:
            reader = PdfReader(BytesIO(content))
            page_texts = (page.extract_text() or "" for page in reader.pages)
            text = "\n".join(part for part in page_texts if part).strip()

            title = None
            try:
                title = reader.metadata.title if reader.metadata else None
            except Exception:
                # Metadata is optional; a broken info dictionary must not
                # fail the whole extraction.
                title = None

            if not text:
                raise LinkExtractionError("Konnte keinen Text aus dem PDF extrahieren.")
            return text, title
        except LinkExtractionError:
            raise
        except Exception as exc:
            raise LinkExtractionError(f"PDF-Extraktion fehlgeschlagen: {exc}") from exc

    def _extract_html(self, content: bytes) -> tuple[str, Optional[str]]:
        """Return ``(main text, title)`` of an HTML page; both may be empty/None."""
        html = content.decode("utf-8", errors="ignore")
        text = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=False,
            include_formatting=False,
            output_format="txt",
        )
        title = None
        try:
            if trafilatura_extract_metadata:
                metadata = trafilatura_extract_metadata(html)
                if metadata and metadata.title:
                    title = metadata.title
        except Exception:
            # The title is a nice-to-have; fall back to the caller's default.
            title = None
        return (text or "").strip(), title

    @staticmethod
    def _host_matches(host: str, domains: tuple[str, ...]) -> bool:
        """Return True if ``host`` equals one of ``domains`` or is a subdomain of it."""
        return any(host == domain or host.endswith("." + domain) for domain in domains)

    def _is_youtube_url(self, url: str) -> bool:
        """Return True if the URL's host is youtube.com, youtu.be or a subdomain."""
        host = (urlparse(url).hostname or "").lower()
        return self._host_matches(host, self._YOUTUBE_DOMAINS + self._YOUTUBE_SHORT_DOMAINS)

    def _extract_youtube_id(self, url: str) -> Optional[str]:
        """Return the video id from a YouTube URL, or None if there is none.

        Supports ``youtu.be/<id>``, ``youtube.com/watch?v=<id>`` and the
        ``/shorts/<id>``, ``/embed/<id>`` and ``/live/<id>`` path forms.
        """
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
        segments = [segment for segment in parsed.path.split("/") if segment]

        if self._host_matches(host, self._YOUTUBE_SHORT_DOMAINS):
            return segments[0] if segments else None

        if self._host_matches(host, self._YOUTUBE_DOMAINS):
            video_ids = parse_qs(parsed.query).get("v")
            if video_ids:
                return video_ids[0]
            if len(segments) >= 2 and segments[0] in self._YOUTUBE_ID_PATH_PREFIXES:
                return segments[1]
        return None

    async def _fetch_youtube_transcript(self, video_id: str) -> str:
        """Fetch a German or English transcript directly from YouTube.

        Manually created captions are preferred over generated ones.

        Raises:
            LinkExtractionError: If no transcript is available or the fetch
                fails.
        """
        # Read outside the try block: the generic error handler below needs
        # this value for the debug dump.
        cookies = (settings.youtube_cookies or "").strip()
        languages = ["de", "de-DE", "de-AT", "de-CH", "en", "en-US", "en-GB"]

        try:
            # Prefer full transcript list to handle generated vs. manually created captions
            # NOTE(review): youtube-transcript-api 0.6.x documents `cookies`
            # as a path to a cookies file, while the setting is described as
            # a raw Cookie header value - confirm this combination works.
            transcript_list = await asyncio.to_thread(
                YouTubeTranscriptApi.list_transcripts,
                video_id,
                cookies=cookies or None,
            )

            transcript = None
            for find in (
                transcript_list.find_manually_created_transcript,
                transcript_list.find_generated_transcript,
                transcript_list.find_transcript,
            ):
                try:
                    transcript = find(languages)
                    break
                except NoTranscriptFound:
                    continue

            if transcript is None:
                raise LinkExtractionError("Kein passendes Transkript gefunden.")

            data = await asyncio.to_thread(transcript.fetch)
            text = " ".join([item.get("text", "") for item in data]).strip()
            if not text:
                raise LinkExtractionError("Kein Transkript verfügbar.")
            return text
        except LinkExtractionError:
            # Our own, already user-facing errors must not be replaced by the
            # generic message (and must not trigger the debug dump) below.
            raise
        except (TranscriptsDisabled, NoTranscriptFound) as exc:
            raise LinkExtractionError("Für dieses Video ist kein Transkript verfügbar.") from exc
        except Exception as exc:
            logger.exception(f"YouTube transcript fetch failed: {exc}")
            debug_dump = await self._debug_youtube_dump(video_id, cookies or None)
            logger.warning(f"YouTube debug dump for {video_id}:\n{debug_dump}")
            raise LinkExtractionError(
                "YouTube-Transkript konnte nicht geladen werden. "
                "YouTube blockiert den Abruf manchmal. "
                "Wenn du Zugriff hast, setze die Umgebungsvariable "
                "`YOUTUBE_COOKIES` mit gültigen Cookies."
            ) from exc

    async def _debug_youtube_dump(self, video_id: str, cookies: Optional[str]) -> str:
        """Request several YouTube endpoints and return status/body snippets for logging."""
        urls = [
            f"https://www.youtube.com/api/timedtext?type=list&v={video_id}",
            f"https://www.youtube.com/api/timedtext?lang=de&v={video_id}",
            f"https://www.youtube.com/api/timedtext?lang=de&v={video_id}&fmt=json3",
            f"https://www.youtube.com/api/timedtext?lang=en&v={video_id}",
            f"https://www.youtube.com/watch?v={video_id}",
        ]
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; LinkedInWorkflowBot/1.0)"
        }
        if cookies:
            headers["Cookie"] = cookies

        lines = [f"cookies_used={bool(cookies)}"]
        async with httpx.AsyncClient(timeout=self.TIMEOUT, headers=headers) as client:
            for url in urls:
                try:
                    resp = await client.get(url)
                    # Keep the log readable: first 2000 chars on a single line.
                    snippet = resp.text[:2000].replace("\n", "\\n").replace("\r", "")
                    lines.append(f"URL: {url}")
                    lines.append(f"STATUS: {resp.status_code}")
                    lines.append(f"BODY_SNIPPET: {snippet}")
                except Exception as exc:
                    lines.append(f"URL: {url}")
                    lines.append(f"ERROR: {repr(exc)}")

        return "\n".join(lines)

    async def _fetch_transcriptapi(self, video_url: str) -> tuple[str, str]:
        """Fetch ``(transcript text, title)`` for a video via TranscriptAPI.

        Raises:
            LinkExtractionError: If the API key is missing, the API does not
                answer with status 200, or no transcript text is returned.
        """
        api_key = (settings.transcriptapi_key or "").strip()
        if not api_key:
            raise LinkExtractionError("TranscriptAPI Key fehlt.")

        endpoint = "https://transcriptapi.com/api/v2/youtube/transcript"
        params = {
            "video_url": video_url,
            "format": "json",
            "include_timestamp": "false",
            "send_metadata": "true",
        }
        headers = {"Authorization": f"Bearer {api_key}"}

        async with httpx.AsyncClient(timeout=self.TIMEOUT) as client:
            resp = await client.get(endpoint, params=params, headers=headers)
            if resp.status_code != 200:
                raise LinkExtractionError(f"TranscriptAPI Fehler: {resp.status_code}")
            data = resp.json()

        # The transcript is accepted either as a list of segments or as one string.
        transcript = data.get("transcript")
        text = ""
        if isinstance(transcript, list):
            text = " ".join([item.get("text", "") for item in transcript]).strip()
        elif isinstance(transcript, str):
            text = transcript.strip()

        title = ""
        meta = data.get("metadata") or {}
        if isinstance(meta, dict):
            title = meta.get("title", "") or ""

        if not text:
            raise LinkExtractionError("TranscriptAPI lieferte kein Transkript.")

        return text, title

    async def _fetch_youtube_title(self, url: str) -> Optional[str]:
        """Look up the video title via YouTube oEmbed; return None on any failure."""
        try:
            async with httpx.AsyncClient(timeout=self.TIMEOUT) as client:
                # Passing the video URL through `params` URL-encodes it, so
                # its own query string (e.g. "&t=10s") stays part of the value.
                response = await client.get(
                    "https://www.youtube.com/oembed",
                    params={"format": "json", "url": url},
                )
                if response.status_code != 200:
                    return None
                data = response.json()
                return data.get("title")
        except Exception:
            # The title is optional; the caller substitutes a default.
            return None
|
||||
1221
src/web/templates/user/create_post_file.html
Normal file
1221
src/web/templates/user/create_post_file.html
Normal file
File diff suppressed because it is too large
Load Diff
1234
src/web/templates/user/create_post_link.html
Normal file
1234
src/web/templates/user/create_post_link.html
Normal file
File diff suppressed because it is too large
Load Diff
@@ -14,7 +14,7 @@
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div class="grid md:grid-cols-2 gap-6">
|
||||
<div class="grid grid-cols-1 md:grid-cols-2 gap-6">
|
||||
<a href="/create/wizard"
|
||||
class="card-bg border rounded-xl p-6 hover:bg-brand-bg-light transition-colors group {% if limit_reached %}opacity-50 pointer-events-none{% endif %}">
|
||||
<div class="w-12 h-12 bg-brand-highlight/20 rounded-lg flex items-center justify-center mb-4">
|
||||
@@ -38,6 +38,30 @@
|
||||
<h2 class="text-lg font-semibold text-white mb-2">Per Chat erstellen</h2>
|
||||
<p class="text-gray-400 text-sm">Schnell und flexibel mit dem Chat-Assistenten.</p>
|
||||
</a>
|
||||
|
||||
<a href="/create/link-wizard"
|
||||
class="card-bg border rounded-xl p-6 hover:bg-brand-bg-light transition-colors group {% if limit_reached %}opacity-50 pointer-events-none{% endif %}">
|
||||
<div class="w-12 h-12 bg-brand-highlight/20 rounded-lg flex items-center justify-center mb-4">
|
||||
<svg class="w-6 h-6 text-brand-highlight" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2"
|
||||
d="M13.828 10.172a4 4 0 010 5.656l-3 3a4 4 0 11-5.656-5.656l1.5-1.5m4.328-4.328a4 4 0 015.656 0l3 3a4 4 0 11-5.656 5.656l-1.5-1.5"/>
|
||||
</svg>
|
||||
</div>
|
||||
<h2 class="text-lg font-semibold text-white mb-2">Aus Link erstellen</h2>
|
||||
<p class="text-gray-400 text-sm">Webseite, PDF oder YouTube-Link analysieren und daraus posten.</p>
|
||||
</a>
|
||||
|
||||
<a href="/create/file-wizard"
|
||||
class="card-bg border rounded-xl p-6 hover:bg-brand-bg-light transition-colors group {% if limit_reached %}opacity-50 pointer-events-none{% endif %}">
|
||||
<div class="w-12 h-12 bg-brand-highlight/20 rounded-lg flex items-center justify-center mb-4">
|
||||
<svg class="w-6 h-6 text-brand-highlight" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2"
|
||||
d="M7 2h7l5 5v13a2 2 0 01-2 2H7a2 2 0 01-2-2V4a2 2 0 012-2z"/>
|
||||
</svg>
|
||||
</div>
|
||||
<h2 class="text-lg font-semibold text-white mb-2">Aus Datei erstellen</h2>
|
||||
<p class="text-gray-400 text-sm">Datei hochladen und daraus einen Post generieren.</p>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
@@ -41,6 +41,9 @@ from src.services.background_jobs import (
|
||||
)
|
||||
from src.services.db_job_manager import job_manager
|
||||
from src.services.storage_service import storage
|
||||
from src.services.link_extractor import LinkExtractor, LinkExtractionError
|
||||
from src.services.file_extractor import FileExtractor, FileExtractionError
|
||||
from src.agents.link_topic_builder import LinkTopicBuilderAgent
|
||||
|
||||
# Router for user frontend
|
||||
user_router = APIRouter(tags=["user"])
|
||||
@@ -1813,6 +1816,64 @@ async def create_post_page(request: Request):
|
||||
})
|
||||
|
||||
|
||||
@user_router.get("/create/link-wizard", response_class=HTMLResponse)
async def create_post_link_page(request: Request):
    """Render the wizard page for creating a post from a link.

    Redirects to ``/login`` when there is no user session. For company and
    employee accounts with a company id, the company token limit is checked
    and the outcome is handed to the template.
    """
    session = require_user_session(request)
    if not session:
        return RedirectResponse(url="/login", status_code=302)

    # Check token limit for companies/employees
    limit_reached, limit_message = False, ""
    belongs_to_company = session.account_type in ("company", "employee")
    if belongs_to_company and session.company_id:
        company_uuid = UUID(session.company_id)
        allowed, message, _, _ = await db.check_company_token_limit(company_uuid)
        limit_reached, limit_message = not allowed, message

    profile_picture = await get_user_avatar(session, UUID(session.user_id))

    context = {
        "request": request,
        "page": "create",
        "session": session,
        "user_id": session.user_id,
        "limit_reached": limit_reached,
        "limit_message": limit_message,
        "profile_picture": profile_picture,
    }
    return templates.TemplateResponse("create_post_link.html", context)
|
||||
|
||||
|
||||
@user_router.get("/create/file-wizard", response_class=HTMLResponse)
async def create_post_file_page(request: Request):
    """Render the wizard page for creating a post from an uploaded file.

    Redirects to ``/login`` when there is no user session. For company and
    employee accounts with a company id, the company token limit is checked
    and the outcome is handed to the template.
    """
    session = require_user_session(request)
    if not session:
        return RedirectResponse(url="/login", status_code=302)

    # Check token limit for companies/employees
    limit_reached, limit_message = False, ""
    belongs_to_company = session.account_type in ("company", "employee")
    if belongs_to_company and session.company_id:
        company_uuid = UUID(session.company_id)
        allowed, message, _, _ = await db.check_company_token_limit(company_uuid)
        limit_reached, limit_message = not allowed, message

    profile_picture = await get_user_avatar(session, UUID(session.user_id))

    context = {
        "request": request,
        "page": "create",
        "session": session,
        "user_id": session.user_id,
        "limit_reached": limit_reached,
        "limit_message": limit_message,
        "profile_picture": profile_picture,
    }
    return templates.TemplateResponse("create_post_file.html", context)
|
||||
|
||||
|
||||
@user_router.get("/chat-create", response_class=HTMLResponse)
|
||||
async def chat_create_page(request: Request):
|
||||
"""Chat-based post creation page."""
|
||||
@@ -2063,6 +2124,117 @@ async def transcribe_audio(request: Request):
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@user_router.post("/api/link-extract")
async def extract_link(request: Request):
    """Extract context from a link and build a structured topic.

    The JSON body may contain ``url``, or a manually supplied ``transcript``
    together with the optional ``title``, ``source_type`` and ``source_url``.
    When a transcript is given it is used as-is and no link is fetched.

    Returns:
        ``{"topic": ..., "source": ...}``.

    Raises:
        HTTPException: 401 without a session, 429 when the company token
            limit is reached, 400 for invalid requests or content that cannot
            be extracted/processed, 500 for unexpected errors.
    """
    session = require_user_session(request)
    if not session:
        raise HTTPException(status_code=401, detail="Not authenticated")

    # Check token limit for companies/employees
    if session.account_type in ("company", "employee") and session.company_id:
        can_create, error_msg, _, _ = await db.check_company_token_limit(UUID(session.company_id))
        if not can_create:
            raise HTTPException(status_code=429, detail=error_msg)

    try:
        try:
            data = await request.json()
        except ValueError as exc:
            # Malformed JSON is a client error, not a server failure.
            raise HTTPException(status_code=400, detail="Ungültige Anfrage.") from exc
        if not isinstance(data, dict):
            raise HTTPException(status_code=400, detail="Ungültige Anfrage.")

        def field(name: str) -> str:
            # Coerce to str so non-string JSON values cannot break .strip().
            return str(data.get(name) or "").strip()

        url = field("url")
        transcript = field("transcript")
        manual_title = field("title")
        source_type = field("source_type")
        source_url = field("source_url")

        if transcript:
            source = {
                "source_url": source_url or url,
                "source_type": source_type or "manual",
                "title": manual_title or "Manuelles Transkript",
                "text": transcript,
            }
        else:
            extractor = LinkExtractor()
            try:
                source = await extractor.extract(url)
            except LinkExtractionError as exc:
                raise HTTPException(status_code=400, detail=str(exc)) from exc

        builder = LinkTopicBuilderAgent()
        builder.set_tracking_context(
            operation="link_extract",
            user_id=session.user_id,
            company_id=session.company_id,
        )
        try:
            topic = await builder.process(source)
        except ValueError as exc:
            # The builder signals unusable content (e.g. too little text)
            # with ValueError; report it like the extraction errors above
            # instead of as an internal server error.
            raise HTTPException(status_code=400, detail=str(exc)) from exc

        return {"topic": topic, "source": source}
    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"Link extraction failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
|
||||
|
||||
@user_router.post("/api/file-extract")
async def extract_file(request: Request):
    """Extract context from an uploaded file and build a structured topic.

    Expects a multipart form with the upload in the ``file`` field. Only the
    extensions in ``allowed_ext`` and files up to 10 MB are accepted.

    Returns:
        ``{"topic": ..., "source": ...}``.

    Raises:
        HTTPException: 401 without a session, 429 when the company token
            limit is reached, 400 for missing/unsupported/oversized files or
            content that cannot be extracted/processed, 500 for unexpected
            errors.
    """
    import asyncio  # local import: only needed to off-load the blocking conversion

    session = require_user_session(request)
    if not session:
        raise HTTPException(status_code=401, detail="Not authenticated")

    # Check token limit for companies/employees
    if session.account_type in ("company", "employee") and session.company_id:
        can_create, error_msg, _, _ = await db.check_company_token_limit(UUID(session.company_id))
        if not can_create:
            raise HTTPException(status_code=429, detail=error_msg)

    try:
        form = await request.form()
        upload: UploadFile = form.get("file")  # type: ignore[assignment]
        # A plain text field named "file" arrives as str, not as an upload.
        if not upload or isinstance(upload, str):
            raise HTTPException(status_code=400, detail="Keine Datei hochgeladen.")

        # Basic validation
        allowed_ext = {".pdf", ".docx", ".pptx", ".xlsx", ".txt", ".md", ".rtf"}
        filename = upload.filename or ""
        ext = Path(filename).suffix.lower()
        if not ext or ext not in allowed_ext:
            raise HTTPException(status_code=400, detail="Dateityp nicht unterstützt.")

        max_bytes = 10 * 1024 * 1024  # 10 MB
        # Read one byte more than allowed: enough to detect an oversized file
        # without copying all of it into memory.
        file_bytes = await upload.read(max_bytes + 1)
        if len(file_bytes) > max_bytes:
            raise HTTPException(status_code=400, detail="Datei ist zu groß (max 10 MB).")

        extractor = FileExtractor()
        try:
            # extract_text is synchronous; run it in a worker thread so the
            # event loop keeps serving other requests meanwhile.
            text = await asyncio.to_thread(extractor.extract_text, file_bytes, filename)
        except FileExtractionError as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc

        source = {
            "source_url": "",
            "source_type": "file",
            "title": filename or "Datei",
            "text": text,
        }

        builder = LinkTopicBuilderAgent()
        builder.set_tracking_context(
            operation="file_extract",
            user_id=session.user_id,
            company_id=session.company_id,
        )
        try:
            topic = await builder.process(source)
        except ValueError as exc:
            # The builder signals unusable content (e.g. too little text)
            # with ValueError; report it like the extraction errors above
            # instead of as an internal server error.
            raise HTTPException(status_code=400, detail=str(exc)) from exc

        return {"topic": topic, "source": source}
    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"File extraction failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
|
||||
|
||||
@user_router.post("/api/hooks")
|
||||
async def generate_hooks(
|
||||
request: Request,
|
||||
|
||||
Reference in New Issue
Block a user