"""Profile analyzer agent."""
import json
from typing import Dict, Any, List
from collections import Counter
from loguru import logger
from src.agents.base import BaseAgent
from src.database.models import LinkedInProfile, LinkedInPost
class ProfileAnalyzerAgent(BaseAgent):
"""Agent for analyzing LinkedIn profiles and extracting writing patterns."""
def __init__(self):
"""Initialize profile analyzer agent."""
super().__init__("ProfileAnalyzer")
async def process(
self,
profile: LinkedInProfile,
posts: List[LinkedInPost],
customer_data: Dict[str, Any]
) -> Dict[str, Any]:
"""
Analyze LinkedIn profile and extract writing patterns.
Args:
profile: LinkedIn profile data
posts: List of LinkedIn posts
customer_data: Additional customer data from input file
Returns:
Comprehensive profile analysis
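
        Example (hypothetical call site; ``profile``, ``posts`` and ``data``
        are assumed to be loaded elsewhere)::

            analysis = await ProfileAnalyzerAgent().process(profile, posts, data)
            hooks = analysis["phrase_library"]["hook_phrases"]
            bigrams = analysis["ngram_patterns"]["typical_bigrams"]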
"""
logger.info(f"Analyzing profile for: {profile.name}")
# Prepare analysis data
profile_summary = {
"name": profile.name,
"headline": profile.headline,
"summary": profile.summary,
"industry": profile.industry,
"location": profile.location
}
        # Prepare posts with engagement data - use up to 15 posts
posts_with_engagement = self._prepare_posts_for_analysis(posts[:15])
# Also identify top performing posts by engagement
top_posts = self._get_top_performing_posts(posts, limit=5)
system_prompt = self._get_system_prompt()
user_prompt = self._get_user_prompt(profile_summary, posts_with_engagement, top_posts, customer_data)
response = await self.call_openai(
system_prompt=system_prompt,
user_prompt=user_prompt,
model="gpt-4o",
temperature=0.3,
response_format={"type": "json_object"}
)
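        # response_format={"type": "json_object"} asks the API for syntactically
        # valid JSON, so json.loads below should not raise on malformed output
        # (the key structure, however, still depends on the model).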
# Parse JSON response
analysis = json.loads(response)
# Add N-gram analysis
post_texts = [p.post_text for p in posts if p.post_text]
ngram_patterns = self._extract_ngram_patterns(post_texts)
analysis["ngram_patterns"] = ngram_patterns
logger.info("Profile analysis completed successfully (with N-gram analysis)")
return analysis
def _prepare_posts_for_analysis(self, posts: List[LinkedInPost]) -> List[Dict[str, Any]]:
"""Prepare posts with engagement data for analysis."""
prepared = []
for i, post in enumerate(posts):
if not post.post_text:
continue
prepared.append({
"index": i + 1,
"text": post.post_text,
"likes": post.likes or 0,
"comments": post.comments or 0,
"shares": post.shares or 0,
"engagement_total": (post.likes or 0) + (post.comments or 0) * 2 + (post.shares or 0) * 3
})
return prepared
def _get_top_performing_posts(self, posts: List[LinkedInPost], limit: int = 5) -> List[Dict[str, Any]]:
"""Get top performing posts by engagement."""
posts_with_engagement = []
for post in posts:
if not post.post_text or len(post.post_text) < 50:
continue
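            # Same weighting as in _prepare_posts_for_analysis:
            # likes x1, comments x2, shares x3.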
engagement = (post.likes or 0) + (post.comments or 0) * 2 + (post.shares or 0) * 3
posts_with_engagement.append({
"text": post.post_text,
"likes": post.likes or 0,
"comments": post.comments or 0,
"shares": post.shares or 0,
"engagement_score": engagement
})
# Sort by engagement and return top posts
sorted_posts = sorted(posts_with_engagement, key=lambda x: x["engagement_score"], reverse=True)
return sorted_posts[:limit]
def _get_system_prompt(self) -> str:
"""Get system prompt for profile analysis."""
return """Du bist ein hochspezialisierter AI-Analyst für LinkedIn-Profile und Content-Strategie.
Deine Aufgabe ist es, aus LinkedIn-Profildaten und Posts ein umfassendes Content-Analyse-Profil zu erstellen, das als BLAUPAUSE für das Schreiben neuer Posts dient.
WICHTIG: Extrahiere ECHTE BEISPIELE aus den Posts! Keine generischen Beschreibungen.
Das Profil soll folgende Dimensionen analysieren:
1. **Schreibstil & Tonalität**
- Wie schreibt die Person? (formal, locker, inspirierend, provokativ, etc.)
- Welche Perspektive wird genutzt? (Ich, Wir, Man)
- Wie ist die Ansprache? (Du, Sie, neutral)
- Satzdynamik und Rhythmus
2. **Phrasen-Bibliothek (KRITISCH!)**
- Hook-Phrasen: Wie beginnen Posts? Extrahiere 5-10 ECHTE Beispiele!
- Übergangs-Phrasen: Wie werden Absätze verbunden?
- Emotionale Ausdrücke: Ausrufe, Begeisterung, etc.
- CTA-Phrasen: Wie werden Leser aktiviert?
- Signature Phrases: Wiederkehrende Markenzeichen
3. **Struktur-Templates**
- Analysiere die STRUKTUR der Top-Posts
- Erstelle 2-3 konkrete Templates (z.B. "Hook → Flashback → Erkenntnis → CTA")
- Typische Satzanfänge für jeden Abschnitt
4. **Visuelle Muster**
- Emoji-Nutzung (welche, wo, wie oft)
- Unicode-Formatierung (fett, kursiv)
- Strukturierung (Absätze, Listen, etc.)
5. **Audience Insights**
- Wer ist die Zielgruppe?
- Welche Probleme werden adressiert?
- Welcher Mehrwert wird geboten?
Gib deine Analyse als strukturiertes JSON zurück."""
def _get_user_prompt(
self,
profile_summary: Dict[str, Any],
posts_with_engagement: List[Dict[str, Any]],
top_posts: List[Dict[str, Any]],
customer_data: Dict[str, Any]
) -> str:
"""Get user prompt with data for analysis."""
# Format all posts with engagement data
all_posts_text = ""
for post in posts_with_engagement:
all_posts_text += f"\n--- Post {post['index']} (Likes: {post['likes']}, Comments: {post['comments']}, Shares: {post['shares']}) ---\n"
all_posts_text += post['text'][:2000] # Limit each post to 2000 chars
all_posts_text += "\n"
# Format top performing posts
top_posts_text = ""
if top_posts:
for i, post in enumerate(top_posts, 1):
top_posts_text += f"\n--- TOP POST {i} (Engagement Score: {post['engagement_score']}, Likes: {post['likes']}, Comments: {post['comments']}) ---\n"
top_posts_text += post['text'][:2000]
top_posts_text += "\n"
return f"""Bitte analysiere folgendes LinkedIn-Profil BASIEREND AUF DEN ECHTEN POSTS:
**PROFIL-INFORMATIONEN:**
- Name: {profile_summary.get('name', 'N/A')}
- Headline: {profile_summary.get('headline', 'N/A')}
- Branche: {profile_summary.get('industry', 'N/A')}
- Location: {profile_summary.get('location', 'N/A')}
- Summary: {profile_summary.get('summary', 'N/A')}
**ZUSÄTZLICHE KUNDENDATEN (Persona, Style Guide, etc.):**
{json.dumps(customer_data, indent=2, ensure_ascii=False)}
**TOP-PERFORMING POSTS (die erfolgreichsten Posts - ANALYSIERE DIESE BESONDERS GENAU!):**
{top_posts_text if top_posts_text else "Keine Engagement-Daten verfügbar"}
**ALLE POSTS ({len(posts_with_engagement)} Posts mit Engagement-Daten):**
{all_posts_text}
---
WICHTIG: Analysiere die ECHTEN POSTS sehr genau! Deine Analyse muss auf den tatsächlichen Mustern basieren, nicht auf Annahmen. Extrahiere WÖRTLICHE ZITATE wo möglich!
Achte besonders auf:
1. Die TOP-PERFORMING Posts - was macht sie erfolgreich?
2. Wiederkehrende Phrasen und Formulierungen - WÖRTLICH extrahieren!
3. Wie beginnen die Posts (Hooks)? - ECHTE BEISPIELE sammeln!
4. Wie enden die Posts (CTAs)?
5. Emoji-Verwendung (welche, wo, wie oft)
6. Länge und Struktur der Absätze
7. Typische Satzanfänge und Übergänge
Erstelle eine umfassende Analyse im folgenden JSON-Format:
{{
"writing_style": {{
"tone": "Beschreibung der Tonalität basierend auf den echten Posts",
"perspective": "Ich/Wir/Man/Gemischt - mit Beispielen aus den Posts",
"form_of_address": "Du/Sie/Neutral - wie spricht die Person die Leser an?",
"sentence_dynamics": "Kurze Sätze? Lange Sätze? Mischung? Fragen?",
"average_post_length": "Kurz/Mittel/Lang",
"average_word_count": 0
}},
"linguistic_fingerprint": {{
"energy_level": 0,
"shouting_usage": "Beschreibung mit konkreten Beispielen aus den Posts",
"punctuation_patterns": "Beschreibung (!!!, ..., ?, etc.)",
"signature_phrases": ["ECHTE Phrasen aus den Posts", "die wiederholt vorkommen"],
"narrative_anchors": ["Storytelling-Elemente", "die die Person nutzt"]
}},
"phrase_library": {{
"hook_phrases": [
"ECHTE Hook-Sätze aus den Posts wörtlich kopiert",
"Mindestens 5-8 verschiedene Beispiele",
"z.B. '𝗞𝗜-𝗦𝘂𝗰𝗵𝗲 𝗶𝘀𝘁 𝗱𝗲𝗿 𝗲𝗿𝘀𝘁𝗲 𝗦𝗰𝗵𝗿𝗶𝘁𝘁 𝗶𝗺 𝗦𝗮𝗹𝗲𝘀 𝗙𝘂𝗻𝗻𝗲𝗹.'"
],
"transition_phrases": [
"ECHTE Übergangssätze zwischen Absätzen",
"z.B. 'Und wisst ihr was?', 'Aber Moment...', 'Was das mit X zu tun hat?'"
],
"emotional_expressions": [
"Ausrufe und emotionale Marker",
"z.B. 'Halleluja!', 'Sorry to say!!', 'Galopp!!!!'"
],
"cta_phrases": [
"ECHTE Call-to-Action Formulierungen",
"z.B. 'Was denkt ihr?', 'Seid ihr dabei?', 'Lasst uns darüber sprechen.'"
],
"filler_expressions": [
"Typische Füllwörter und Ausdrücke",
"z.B. 'Ich meine...', 'Wisst ihr...', 'Ok, ok...'"
]
}},
"structure_templates": {{
"primary_structure": "Die häufigste Struktur beschreiben, z.B. 'Unicode-Hook → Persönliche Anekdote → Erkenntnis → Bullet Points → CTA'",
"template_examples": [
{{
"name": "Storytelling-Post",
"structure": ["Fetter Hook mit Zitat", "Flashback/Anekdote", "Erkenntnis/Lesson", "Praktische Tipps", "CTA-Frage"],
"example_post_index": 1
}},
{{
"name": "Insight-Post",
"structure": ["Provokante These", "Begründung", "Beispiel", "Handlungsaufforderung"],
"example_post_index": 2
}}
],
"typical_sentence_starters": [
"ECHTE Satzanfänge aus den Posts",
"z.B. 'Ich glaube, dass...', 'Was mir aufgefallen ist...', 'Das Verrückte ist...'"
],
"paragraph_transitions": [
"Wie werden Absätze eingeleitet?",
"z.B. 'Und...', 'Aber:', 'Das bedeutet:'"
]
}},
"tone_analysis": {{
"primary_tone": "Haupttonalität basierend auf den Posts",
"emotional_range": "Welche Emotionen werden angesprochen?",
"authenticity_markers": ["Was macht den Stil einzigartig?", "Erkennbare Merkmale"]
}},
"topic_patterns": {{
"main_topics": ["Hauptthemen aus den Posts"],
"content_pillars": ["Content-Säulen"],
"expertise_areas": ["Expertise-Bereiche"],
"expertise_level": "Anfänger/Fortgeschritten/Experte"
}},
"audience_insights": {{
"target_audience": "Wer wird angesprochen?",
"pain_points_addressed": ["Probleme die adressiert werden"],
"value_proposition": "Welchen Mehrwert bietet die Person?",
"industry_context": "Branchenkontext"
}},
"visual_patterns": {{
"emoji_usage": {{
"emojis": ["Liste der tatsächlich verwendeten Emojis"],
"placement": "Anfang/Ende/Inline/Zwischen Absätzen",
"frequency": "Selten/Mittel/Häufig - pro Post durchschnittlich X"
}},
"unicode_formatting": "Wird ✓, →, •, 𝗙𝗲𝘁𝘁 etc. verwendet? Wo?",
"structure_preferences": "Absätze/Listen/Einzeiler/Nummeriert"
}},
"content_strategy": {{
"hook_patterns": "Wie werden Posts KONKRET eröffnet? Beschreibung des Musters",
"cta_style": "Wie sehen die CTAs aus? Frage? Aufforderung? Keine?",
"storytelling_approach": "Persönliche Geschichten? Metaphern? Case Studies?",
"post_structure": "Hook → Body → CTA? Oder anders?"
}},
"best_performing_patterns": {{
"what_works": "Was machen die Top-Posts anders/besser?",
"successful_hooks": ["WÖRTLICHE Beispiel-Hooks aus Top-Posts"],
"engagement_drivers": ["Was treibt Engagement?"]
}}
}}
KRITISCH: Bei phrase_library und structure_templates müssen ECHTE, WÖRTLICHE Beispiele aus den Posts stehen! Keine generischen Beschreibungen!"""
def _extract_ngram_patterns(self, post_texts: List[str]) -> Dict[str, Any]:
"""
Extract N-gram patterns from posts for better style fingerprinting.
Args:
post_texts: List of post texts
Returns:
Dictionary with bigrams, trigrams, and signature patterns
"""
        if not post_texts:
return {
"typical_bigrams": [],
"typical_trigrams": [],
"signature_combinations": [],
"available": False
}
try:
import nltk
from nltk import ngrams, word_tokenize
# Download required NLTK data if not available
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
logger.info("Downloading NLTK punkt tokenizer...")
nltk.download('punkt', quiet=True)
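                # Note: NLTK 3.9+ may additionally require the 'punkt_tab'
                # resource; if tokenization still raises a LookupError,
                # download it the same way.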
# Combine all text
all_text = " ".join(post_texts)
# Tokenize (try German tokenizer, fallback to default)
try:
words = word_tokenize(all_text.lower(), language='german')
            except LookupError:
words = word_tokenize(all_text.lower())
# Remove punctuation and very short words
words = [w for w in words if w.isalnum() and len(w) > 2]
# Extract bigrams (2-word combinations)
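            # e.g. ngrams(["sales", "funnel", "optimieren"], 2)
            #      -> ("sales", "funnel"), ("funnel", "optimieren")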
bigrams = list(ngrams(words, 2))
bigram_freq = Counter(bigrams)
# Extract trigrams (3-word combinations)
trigrams = list(ngrams(words, 3))
trigram_freq = Counter(trigrams)
# Get top bigrams and trigrams
top_bigrams = [" ".join(bg) for bg, _ in bigram_freq.most_common(50)]
top_trigrams = [" ".join(tg) for tg, _ in trigram_freq.most_common(30)]
# Find signature patterns (unique combinations that appear multiple times)
signature_patterns = self._find_signature_patterns(bigram_freq, trigram_freq)
logger.info(f"Extracted {len(top_bigrams)} bigrams, {len(top_trigrams)} trigrams, "
f"{len(signature_patterns)} signature patterns")
return {
"typical_bigrams": top_bigrams,
"typical_trigrams": top_trigrams,
"signature_combinations": signature_patterns,
"bigram_count": len(bigrams),
"trigram_count": len(trigrams),
"available": True
}
except Exception as e:
logger.error(f"N-gram extraction failed: {e}")
return {
"typical_bigrams": [],
"typical_trigrams": [],
"signature_combinations": [],
"available": False,
"error": str(e)
}
def _find_signature_patterns(
self,
bigram_freq: Counter,
trigram_freq: Counter
) -> List[str]:
"""
Find signature patterns - unique combinations that are not generic.
Args:
bigram_freq: Counter of bigram frequencies
trigram_freq: Counter of trigram frequencies
Returns:
List of signature pattern strings
"""
# Generic German words to filter out
generic_words = {
'und', 'die', 'der', 'das', 'ist', 'sind', 'ein', 'eine',
'zu', 'den', 'für', 'mit', 'auf', 'von', 'in', 'des',
'dem', 'im', 'zum', 'zur', 'am', 'bei', 'hat', 'haben',
'wird', 'werden', 'kann', 'können', 'soll', 'müssen',
'auch', 'nur', 'noch', 'schon', 'sehr', 'mehr', 'aber',
'oder', 'wenn', 'dann', 'als', 'wie', 'nach', 'über'
}
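        # Illustration of the filtering below: "und die" (both words generic)
        # is dropped, while e.g. "sales funnel" (no generic word) is kept.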
signature_patterns = []
# Find non-generic bigrams that appear at least 3 times
for (w1, w2), count in bigram_freq.most_common(100):
if count >= 3: # Must appear multiple times
                # Keep the bigram if at least one word is non-generic
if w1 not in generic_words or w2 not in generic_words:
pattern = f"{w1} {w2}"
if pattern not in signature_patterns:
signature_patterns.append(pattern)
# Find non-generic trigrams that appear at least 2 times
for (w1, w2, w3), count in trigram_freq.most_common(50):
if count >= 2:
# At least one word should not be generic
non_generic_count = sum(1 for w in [w1, w2, w3] if w not in generic_words)
if non_generic_count >= 2:
pattern = f"{w1} {w2} {w3}"
if pattern not in signature_patterns:
signature_patterns.append(pattern)
        # Return at most 20 patterns (in insertion order: frequent bigrams first)
return signature_patterns[:20]
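

if __name__ == "__main__":
    # Minimal smoke test for the N-gram helper only (a sketch: it assumes
    # ProfileAnalyzerAgent, via BaseAgent, can be constructed without extra
    # configuration, which may not hold in this project).
    sample_posts = [
        "Der Sales Funnel beginnt heute mit der Suche nach Vertrauen.",
        "Der Sales Funnel veraendert sich durch Suche und Vertrauen.",
    ]
    agent = ProfileAnalyzerAgent()
    print(agent._extract_ngram_patterns(sample_posts))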