new quality save layer
@@ -1,6 +1,7 @@
 """Profile analyzer agent."""
 import json
 from typing import Dict, Any, List
+from collections import Counter
 from loguru import logger
 
 from src.agents.base import BaseAgent
@@ -61,7 +62,13 @@ class ProfileAnalyzerAgent(BaseAgent):
 
             # Parse JSON response
             analysis = json.loads(response)
-            logger.info("Profile analysis completed successfully")
+
+            # Add N-gram analysis
+            post_texts = [p.post_text for p in posts if p.post_text]
+            ngram_patterns = self._extract_ngram_patterns(post_texts)
+            analysis["ngram_patterns"] = ngram_patterns
+
+            logger.info("Profile analysis completed successfully (with N-gram analysis)")
 
             return analysis
 
@@ -298,3 +305,130 @@ Erstelle eine umfassende Analyse im folgenden JSON-Format:
 }}
 
 KRITISCH: Bei phrase_library und structure_templates müssen ECHTE, WÖRTLICHE Beispiele aus den Posts stehen! Keine generischen Beschreibungen!"""
+
+    def _extract_ngram_patterns(self, post_texts: List[str]) -> Dict[str, Any]:
+        """
+        Extract N-gram patterns from posts for better style fingerprinting.
+
+        Args:
+            post_texts: List of post texts
+
+        Returns:
+            Dictionary with bigrams, trigrams, and signature patterns
+        """
+        if not post_texts:
+            return {
+                "typical_bigrams": [],
+                "typical_trigrams": [],
+                "signature_combinations": [],
+                "available": False
+            }
+
+        try:
+            import nltk
+            from nltk import ngrams, word_tokenize
+
+            # Download the required NLTK data if it is not available
+            try:
+                nltk.data.find('tokenizers/punkt')
+            except LookupError:
+                logger.info("Downloading NLTK punkt tokenizer...")
+                nltk.download('punkt', quiet=True)
+
+            # Combine all text
+            all_text = " ".join(post_texts)
+
+            # Tokenize (try the German tokenizer, fall back to the default)
+            try:
+                words = word_tokenize(all_text.lower(), language='german')
+            except LookupError:
+                words = word_tokenize(all_text.lower())
+
+            # Remove punctuation and very short words
+            words = [w for w in words if w.isalnum() and len(w) > 2]
+
+            # Extract bigrams (2-word combinations)
+            bigrams = list(ngrams(words, 2))
+            bigram_freq = Counter(bigrams)
+
+            # Extract trigrams (3-word combinations)
+            trigrams = list(ngrams(words, 3))
+            trigram_freq = Counter(trigrams)
+
+            # Get the top bigrams and trigrams
+            top_bigrams = [" ".join(bg) for bg, _ in bigram_freq.most_common(50)]
+            top_trigrams = [" ".join(tg) for tg, _ in trigram_freq.most_common(30)]
+
+            # Find signature patterns (distinctive combinations that appear multiple times)
+            signature_patterns = self._find_signature_patterns(bigram_freq, trigram_freq)
+
+            logger.info(f"Extracted {len(top_bigrams)} bigrams, {len(top_trigrams)} trigrams, "
+                        f"{len(signature_patterns)} signature patterns")
+
+            return {
+                "typical_bigrams": top_bigrams,
+                "typical_trigrams": top_trigrams,
+                "signature_combinations": signature_patterns,
+                "bigram_count": len(bigrams),
+                "trigram_count": len(trigrams),
+                "available": True
+            }
+
+        except Exception as e:
+            logger.error(f"N-gram extraction failed: {e}")
+            return {
+                "typical_bigrams": [],
+                "typical_trigrams": [],
+                "signature_combinations": [],
+                "available": False,
+                "error": str(e)
+            }
+
+    def _find_signature_patterns(
+        self,
+        bigram_freq: Counter,
+        trigram_freq: Counter
+    ) -> List[str]:
+        """
+        Find signature patterns - distinctive combinations that are not generic.
+
+        Args:
+            bigram_freq: Counter of bigram frequencies
+            trigram_freq: Counter of trigram frequencies
+
+        Returns:
+            List of signature pattern strings
+        """
+        # Generic German words to filter out
+        generic_words = {
+            'und', 'die', 'der', 'das', 'ist', 'sind', 'ein', 'eine',
+            'zu', 'den', 'für', 'mit', 'auf', 'von', 'in', 'des',
+            'dem', 'im', 'zum', 'zur', 'am', 'bei', 'hat', 'haben',
+            'wird', 'werden', 'kann', 'können', 'soll', 'müssen',
+            'auch', 'nur', 'noch', 'schon', 'sehr', 'mehr', 'aber',
+            'oder', 'wenn', 'dann', 'als', 'wie', 'nach', 'über'
+        }
+
+        signature_patterns = []
+
+        # Keep bigrams that appear at least 3 times and in which
+        # at least one word is non-generic
+        for (w1, w2), count in bigram_freq.most_common(100):
+            if count >= 3:
+                if w1 not in generic_words or w2 not in generic_words:
+                    pattern = f"{w1} {w2}"
+                    if pattern not in signature_patterns:
+                        signature_patterns.append(pattern)
+
+        # Keep trigrams that appear at least 2 times and in which
+        # at least two of the three words are non-generic
+        for (w1, w2, w3), count in trigram_freq.most_common(50):
+            if count >= 2:
+                non_generic_count = sum(1 for w in [w1, w2, w3] if w not in generic_words)
+                if non_generic_count >= 2:
+                    pattern = f"{w1} {w2} {w3}"
+                    if pattern not in signature_patterns:
+                        signature_patterns.append(pattern)
+
+        # Return at most the first 20 patterns (ordered by frequency)
+        return signature_patterns[:20]
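
To see what the new layer extracts, here is a minimal standalone sketch of the same bigram counting, assuming nltk is installed. The sample posts are invented for illustration; in the agent this logic runs inside `_extract_ngram_patterns` on real post texts, and note that newer NLTK releases may expect the 'punkt_tab' resource instead of 'punkt'.

```python
from collections import Counter

import nltk
from nltk import ngrams, word_tokenize

# Invented sample posts, for illustration only.
posts = [
    "Heute teile ich drei Learnings aus meinem letzten Projekt.",
    "Drei Learnings aus zehn Jahren Vertrieb: Zuhören schlägt Reden.",
    "Mein letztes Projekt hat mir wieder drei Learnings beschert.",
]

# Mirror the agent's lazy download of the punkt tokenizer.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

# Tokenize, lowercase, and drop punctuation and very short words,
# as in _extract_ngram_patterns.
words = word_tokenize(" ".join(posts).lower(), language='german')
words = [w for w in words if w.isalnum() and len(w) > 2]

# Count bigrams; recurring ones like "drei learnings" are the
# candidates that _find_signature_patterns would keep.
bigram_freq = Counter(ngrams(words, 2))
for (w1, w2), count in bigram_freq.most_common(5):
    print(f"{w1} {w2}: {count}")
```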