"""Profile analyzer agent."""
import json
from typing import Dict, Any, List
from collections import Counter
from loguru import logger
from src.agents.base import BaseAgent
from src.database.models import LinkedInProfile, LinkedInPost
class ProfileAnalyzerAgent(BaseAgent):
"""Agent for analyzing LinkedIn profiles and extracting writing patterns."""
def __init__(self):
"""Initialize profile analyzer agent."""
super().__init__("ProfileAnalyzer")
async def process(
self,
profile: LinkedInProfile,
posts: List[LinkedInPost],
customer_data: Dict[str, Any]
) -> Dict[str, Any]:
"""
Analyze LinkedIn profile and extract writing patterns.
Args:
profile: LinkedIn profile data
posts: List of LinkedIn posts
customer_data: Additional customer data from input file
Returns:
Comprehensive profile analysis
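
        Example (hypothetical call site; ``profile``, ``posts`` and ``data``
        are assumed to be loaded elsewhere)::

            analysis = await ProfileAnalyzerAgent().process(profile, posts, data)
            hooks = analysis["phrase_library"]["hook_phrases"]
            bigrams = analysis["ngram_patterns"]["typical_bigrams"]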
"""
logger.info(f"Analyzing profile for: {profile.name}")
# Prepare analysis data
profile_summary = {
"name": profile.name,
"headline": profile.headline,
"summary": profile.summary,
"industry": profile.industry,
"location": profile.location
}
        # Prepare posts with engagement data - use up to 15 posts
posts_with_engagement = self._prepare_posts_for_analysis(posts[:15])
# Also identify top performing posts by engagement
top_posts = self._get_top_performing_posts(posts, limit=5)
system_prompt = self._get_system_prompt()
user_prompt = self._get_user_prompt(profile_summary, posts_with_engagement, top_posts, customer_data)
response = await self.call_openai(
system_prompt=system_prompt,
user_prompt=user_prompt,
model="gpt-4o",
temperature=0.3,
response_format={"type": "json_object"}
)
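        # response_format={"type": "json_object"} asks the API for syntactically
        # valid JSON, so json.loads below should not raise on malformed output
        # (the key structure, however, still depends on the model).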
# Parse JSON response
analysis = json.loads(response)
# Add N-gram analysis
post_texts = [p.post_text for p in posts if p.post_text]
ngram_patterns = self._extract_ngram_patterns(post_texts)
analysis["ngram_patterns"] = ngram_patterns
logger.info("Profile analysis completed successfully (with N-gram analysis)")
return analysis
def _prepare_posts_for_analysis(self, posts: List[LinkedInPost]) -> List[Dict[str, Any]]:
"""Prepare posts with engagement data for analysis."""
prepared = []
for i, post in enumerate(posts):
if not post.post_text:
continue
prepared.append({
"index": i + 1,
"text": post.post_text,
"likes": post.likes or 0,
"comments": post.comments or 0,
"shares": post.shares or 0,
"engagement_total": (post.likes or 0) + (post.comments or 0) * 2 + (post.shares or 0) * 3
})
return prepared
def _get_top_performing_posts(self, posts: List[LinkedInPost], limit: int = 5) -> List[Dict[str, Any]]:
"""Get top performing posts by engagement."""
posts_with_engagement = []
for post in posts:
if not post.post_text or len(post.post_text) < 50:
continue
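            # Same weighting as in _prepare_posts_for_analysis:
            # likes x1, comments x2, shares x3.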
engagement = (post.likes or 0) + (post.comments or 0) * 2 + (post.shares or 0) * 3
posts_with_engagement.append({
"text": post.post_text,
"likes": post.likes or 0,
"comments": post.comments or 0,
"shares": post.shares or 0,
"engagement_score": engagement
})
# Sort by engagement and return top posts
sorted_posts = sorted(posts_with_engagement, key=lambda x: x["engagement_score"], reverse=True)
return sorted_posts[:limit]
def _get_system_prompt(self) -> str:
"""Get system prompt for profile analysis."""
return """Du bist ein hochspezialisierter AI-Analyst für LinkedIn-Profile und Content-Strategie.
Deine Aufgabe ist es, aus LinkedIn-Profildaten und Posts ein umfassendes Content-Analyse-Profil zu erstellen, das als BLAUPAUSE für das Schreiben neuer Posts dient.
WICHTIG: Extrahiere ECHTE BEISPIELE aus den Posts! Keine generischen Beschreibungen.
Das Profil soll folgende Dimensionen analysieren:
1. **Schreibstil & Tonalität**
- Wie schreibt die Person? (formal, locker, inspirierend, provokativ, etc.)
- Welche Perspektive wird genutzt? (Ich, Wir, Man)
- Wie ist die Ansprache? (Du, Sie, neutral)
- Satzdynamik und Rhythmus
2. **Phrasen-Bibliothek (KRITISCH!)**
- Hook-Phrasen: Wie beginnen Posts? Extrahiere 5-10 ECHTE Beispiele!
- Übergangs-Phrasen: Wie werden Absätze verbunden?
- Emotionale Ausdrücke: Ausrufe, Begeisterung, etc.
- CTA-Phrasen: Wie werden Leser aktiviert?
- Signature Phrases: Wiederkehrende Markenzeichen
3. **Struktur-Templates**
- Analysiere die STRUKTUR der Top-Posts
- Erstelle 2-3 konkrete Templates (z.B. "Hook → Flashback → Erkenntnis → CTA")
- Typische Satzanfänge für jeden Abschnitt
4. **Visuelle Muster**
- Emoji-Nutzung (welche, wo, wie oft)
- Unicode-Formatierung (fett, kursiv)
- Strukturierung (Absätze, Listen, etc.)
5. **Audience Insights**
- Wer ist die Zielgruppe?
- Welche Probleme werden adressiert?
- Welcher Mehrwert wird geboten?
Gib deine Analyse als strukturiertes JSON zurück."""
def _get_user_prompt(
self,
profile_summary: Dict[str, Any],
posts_with_engagement: List[Dict[str, Any]],
top_posts: List[Dict[str, Any]],
customer_data: Dict[str, Any]
) -> str:
"""Get user prompt with data for analysis."""
# Format all posts with engagement data
all_posts_text = ""
for post in posts_with_engagement:
all_posts_text += f"\n--- Post {post['index']} (Likes: {post['likes']}, Comments: {post['comments']}, Shares: {post['shares']}) ---\n"
all_posts_text += post['text'][:2000] # Limit each post to 2000 chars
all_posts_text += "\n"
# Format top performing posts
top_posts_text = ""
if top_posts:
for i, post in enumerate(top_posts, 1):
top_posts_text += f"\n--- TOP POST {i} (Engagement Score: {post['engagement_score']}, Likes: {post['likes']}, Comments: {post['comments']}) ---\n"
top_posts_text += post['text'][:2000]
top_posts_text += "\n"
return f"""Bitte analysiere folgendes LinkedIn-Profil BASIEREND AUF DEN ECHTEN POSTS:
**PROFIL-INFORMATIONEN:**
- Name: {profile_summary.get('name', 'N/A')}
- Headline: {profile_summary.get('headline', 'N/A')}
- Branche: {profile_summary.get('industry', 'N/A')}
- Location: {profile_summary.get('location', 'N/A')}
- Summary: {profile_summary.get('summary', 'N/A')}
**ZUSÄTZLICHE KUNDENDATEN (Persona, Style Guide, etc.):**
{json.dumps(customer_data, indent=2, ensure_ascii=False)}
**TOP-PERFORMING POSTS (die erfolgreichsten Posts - ANALYSIERE DIESE BESONDERS GENAU!):**
{top_posts_text if top_posts_text else "Keine Engagement-Daten verfügbar"}
**ALLE POSTS ({len(posts_with_engagement)} Posts mit Engagement-Daten):**
{all_posts_text}
---
WICHTIG: Analysiere die ECHTEN POSTS sehr genau! Deine Analyse muss auf den tatsächlichen Mustern basieren, nicht auf Annahmen. Extrahiere WÖRTLICHE ZITATE wo möglich!
Achte besonders auf:
1. Die TOP-PERFORMING Posts - was macht sie erfolgreich?
2. Wiederkehrende Phrasen und Formulierungen - WÖRTLICH extrahieren!
3. Wie beginnen die Posts (Hooks)? - ECHTE BEISPIELE sammeln!
4. Wie enden die Posts (CTAs)?
5. Emoji-Verwendung (welche, wo, wie oft)
6. Länge und Struktur der Absätze
7. Typische Satzanfänge und Übergänge
Erstelle eine umfassende Analyse im folgenden JSON-Format:
{{
"writing_style": {{
"tone": "Beschreibung der Tonalität basierend auf den echten Posts",
"perspective": "Ich/Wir/Man/Gemischt - mit Beispielen aus den Posts",
"form_of_address": "Du/Sie/Neutral - wie spricht die Person die Leser an?",
"sentence_dynamics": "Kurze Sätze? Lange Sätze? Mischung? Fragen?",
"average_post_length": "Kurz/Mittel/Lang",
"average_word_count": 0
}},
"linguistic_fingerprint": {{
"energy_level": 0,
"shouting_usage": "Beschreibung mit konkreten Beispielen aus den Posts",
"punctuation_patterns": "Beschreibung (!!!, ..., ?, etc.)",
"signature_phrases": ["ECHTE Phrasen aus den Posts", "die wiederholt vorkommen"],
"narrative_anchors": ["Storytelling-Elemente", "die die Person nutzt"]
}},
"phrase_library": {{
"hook_phrases": [
"ECHTE Hook-Sätze aus den Posts wörtlich kopiert",
"Mindestens 5-8 verschiedene Beispiele",
"z.B. '𝗞𝗜-𝗦𝘂𝗰𝗵𝗲 𝗶𝘀𝘁 𝗱𝗲𝗿 𝗲𝗿𝘀𝘁𝗲 𝗦𝗰𝗵𝗿𝗶𝘁𝘁 𝗶𝗺 𝗦𝗮𝗹𝗲𝘀 𝗙𝘂𝗻𝗻𝗲𝗹.'"
],
"transition_phrases": [
"ECHTE Übergangssätze zwischen Absätzen",
"z.B. 'Und wisst ihr was?', 'Aber Moment...', 'Was das mit X zu tun hat?'"
],
"emotional_expressions": [
"Ausrufe und emotionale Marker",
"z.B. 'Halleluja!', 'Sorry to say!!', 'Galopp!!!!'"
],
"cta_phrases": [
"ECHTE Call-to-Action Formulierungen",
"z.B. 'Was denkt ihr?', 'Seid ihr dabei?', 'Lasst uns darüber sprechen.'"
],
"filler_expressions": [
"Typische Füllwörter und Ausdrücke",
"z.B. 'Ich meine...', 'Wisst ihr...', 'Ok, ok...'"
]
}},
"structure_templates": {{
"primary_structure": "Die häufigste Struktur beschreiben, z.B. 'Unicode-Hook → Persönliche Anekdote → Erkenntnis → Bullet Points → CTA'",
"template_examples": [
{{
"name": "Storytelling-Post",
"structure": ["Fetter Hook mit Zitat", "Flashback/Anekdote", "Erkenntnis/Lesson", "Praktische Tipps", "CTA-Frage"],
"example_post_index": 1
}},
{{
"name": "Insight-Post",
"structure": ["Provokante These", "Begründung", "Beispiel", "Handlungsaufforderung"],
"example_post_index": 2
}}
],
"typical_sentence_starters": [
"ECHTE Satzanfänge aus den Posts",
"z.B. 'Ich glaube, dass...', 'Was mir aufgefallen ist...', 'Das Verrückte ist...'"
],
"paragraph_transitions": [
"Wie werden Absätze eingeleitet?",
"z.B. 'Und...', 'Aber:', 'Das bedeutet:'"
]
}},
"tone_analysis": {{
"primary_tone": "Haupttonalität basierend auf den Posts",
"emotional_range": "Welche Emotionen werden angesprochen?",
"authenticity_markers": ["Was macht den Stil einzigartig?", "Erkennbare Merkmale"]
}},
"topic_patterns": {{
"main_topics": ["Hauptthemen aus den Posts"],
"content_pillars": ["Content-Säulen"],
"expertise_areas": ["Expertise-Bereiche"],
"expertise_level": "Anfänger/Fortgeschritten/Experte"
}},
"audience_insights": {{
"target_audience": "Wer wird angesprochen?",
"pain_points_addressed": ["Probleme die adressiert werden"],
"value_proposition": "Welchen Mehrwert bietet die Person?",
"industry_context": "Branchenkontext"
}},
"visual_patterns": {{
"emoji_usage": {{
"emojis": ["Liste der tatsächlich verwendeten Emojis"],
"placement": "Anfang/Ende/Inline/Zwischen Absätzen",
"frequency": "Selten/Mittel/Häufig - pro Post durchschnittlich X"
}},
"unicode_formatting": "Wird ✓, →, •, 𝗙𝗲𝘁𝘁 etc. verwendet? Wo?",
"structure_preferences": "Absätze/Listen/Einzeiler/Nummeriert"
}},
"content_strategy": {{
"hook_patterns": "Wie werden Posts KONKRET eröffnet? Beschreibung des Musters",
"cta_style": "Wie sehen die CTAs aus? Frage? Aufforderung? Keine?",
"storytelling_approach": "Persönliche Geschichten? Metaphern? Case Studies?",
"post_structure": "Hook → Body → CTA? Oder anders?"
}},
"best_performing_patterns": {{
"what_works": "Was machen die Top-Posts anders/besser?",
"successful_hooks": ["WÖRTLICHE Beispiel-Hooks aus Top-Posts"],
"engagement_drivers": ["Was treibt Engagement?"]
}}
}}
KRITISCH: Bei phrase_library und structure_templates müssen ECHTE, WÖRTLICHE Beispiele aus den Posts stehen! Keine generischen Beschreibungen!"""
def _extract_ngram_patterns(self, post_texts: List[str]) -> Dict[str, Any]:
"""
Extract N-gram patterns from posts for better style fingerprinting.
Args:
post_texts: List of post texts
Returns:
Dictionary with bigrams, trigrams, and signature patterns
"""
        if not post_texts:
return {
"typical_bigrams": [],
"typical_trigrams": [],
"signature_combinations": [],
"available": False
}
try:
import nltk
from nltk import ngrams, word_tokenize
# Download required NLTK data if not available
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
logger.info("Downloading NLTK punkt tokenizer...")
nltk.download('punkt', quiet=True)
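                # Note: NLTK 3.9+ may additionally require the 'punkt_tab'
                # resource; if tokenization still raises a LookupError,
                # download it the same way.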
# Combine all text
all_text = " ".join(post_texts)
# Tokenize (try German tokenizer, fallback to default)
try:
words = word_tokenize(all_text.lower(), language='german')
            except LookupError:
words = word_tokenize(all_text.lower())
# Remove punctuation and very short words
words = [w for w in words if w.isalnum() and len(w) > 2]
# Extract bigrams (2-word combinations)
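            # e.g. ngrams(["sales", "funnel", "optimieren"], 2)
            #      -> ("sales", "funnel"), ("funnel", "optimieren")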
bigrams = list(ngrams(words, 2))
bigram_freq = Counter(bigrams)
# Extract trigrams (3-word combinations)
trigrams = list(ngrams(words, 3))
trigram_freq = Counter(trigrams)
# Get top bigrams and trigrams
top_bigrams = [" ".join(bg) for bg, _ in bigram_freq.most_common(50)]
top_trigrams = [" ".join(tg) for tg, _ in trigram_freq.most_common(30)]
# Find signature patterns (unique combinations that appear multiple times)
signature_patterns = self._find_signature_patterns(bigram_freq, trigram_freq)
logger.info(f"Extracted {len(top_bigrams)} bigrams, {len(top_trigrams)} trigrams, "
f"{len(signature_patterns)} signature patterns")
return {
"typical_bigrams": top_bigrams,
"typical_trigrams": top_trigrams,
"signature_combinations": signature_patterns,
"bigram_count": len(bigrams),
"trigram_count": len(trigrams),
"available": True
}
except Exception as e:
logger.error(f"N-gram extraction failed: {e}")
return {
"typical_bigrams": [],
"typical_trigrams": [],
"signature_combinations": [],
"available": False,
"error": str(e)
}
def _find_signature_patterns(
self,
bigram_freq: Counter,
trigram_freq: Counter
) -> List[str]:
"""
Find signature patterns - unique combinations that are not generic.
Args:
bigram_freq: Counter of bigram frequencies
trigram_freq: Counter of trigram frequencies
Returns:
List of signature pattern strings
"""
# Generic German words to filter out
generic_words = {
'und', 'die', 'der', 'das', 'ist', 'sind', 'ein', 'eine',
'zu', 'den', 'für', 'mit', 'auf', 'von', 'in', 'des',
'dem', 'im', 'zum', 'zur', 'am', 'bei', 'hat', 'haben',
'wird', 'werden', 'kann', 'können', 'soll', 'müssen',
'auch', 'nur', 'noch', 'schon', 'sehr', 'mehr', 'aber',
'oder', 'wenn', 'dann', 'als', 'wie', 'nach', 'über'
}
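        # Illustration of the filtering below: "und die" (both words generic)
        # is dropped, while e.g. "sales funnel" (no generic word) is kept.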
signature_patterns = []
# Find non-generic bigrams that appear at least 3 times
for (w1, w2), count in bigram_freq.most_common(100):
if count >= 3: # Must appear multiple times
                # Keep the bigram if at least one word is non-generic
if w1 not in generic_words or w2 not in generic_words:
pattern = f"{w1} {w2}"
if pattern not in signature_patterns:
signature_patterns.append(pattern)
# Find non-generic trigrams that appear at least 2 times
for (w1, w2, w3), count in trigram_freq.most_common(50):
if count >= 2:
# At least one word should not be generic
non_generic_count = sum(1 for w in [w1, w2, w3] if w not in generic_words)
if non_generic_count >= 2:
pattern = f"{w1} {w2} {w3}"
if pattern not in signature_patterns:
signature_patterns.append(pattern)
        # Return at most 20 patterns (in insertion order: frequent bigrams first)
return signature_patterns[:20]
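

if __name__ == "__main__":
    # Minimal smoke test for the N-gram helper only (a sketch: it assumes
    # ProfileAnalyzerAgent, via BaseAgent, can be constructed without extra
    # configuration, which may not hold in this project).
    sample_posts = [
        "Der Sales Funnel beginnt heute mit der Suche nach Vertrauen.",
        "Der Sales Funnel veraendert sich durch Suche und Vertrauen.",
    ]
    agent = ProfileAnalyzerAgent()
    print(agent._extract_ngram_patterns(sample_posts))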