new quality save layer

2026-02-12 14:17:36 +01:00
parent 7c69302b33
commit 1ebf50ab04
16 changed files with 2256 additions and 30 deletions


@@ -1,6 +1,7 @@
"""Profile analyzer agent."""
import json
from typing import Dict, Any, List
from collections import Counter
from loguru import logger
from src.agents.base import BaseAgent
@@ -61,7 +62,13 @@ class ProfileAnalyzerAgent(BaseAgent):
        # Parse JSON response
        analysis = json.loads(response)
        logger.info("Profile analysis completed successfully")

        # Add N-gram analysis
        post_texts = [p.post_text for p in posts if p.post_text]
        ngram_patterns = self._extract_ngram_patterns(post_texts)
        analysis["ngram_patterns"] = ngram_patterns

        logger.info("Profile analysis completed successfully (with N-gram analysis)")
        return analysis
@@ -298,3 +305,130 @@ Erstelle eine umfassende Analyse im folgenden JSON-Format:
}}
KRITISCH: Bei phrase_library und structure_templates müssen ECHTE, WÖRTLICHE Beispiele aus den Posts stehen! Keine generischen Beschreibungen!"""
    def _extract_ngram_patterns(self, post_texts: List[str]) -> Dict[str, Any]:
        """
        Extract N-gram patterns from posts for better style fingerprinting.

        Args:
            post_texts: List of post texts

        Returns:
            Dictionary with bigrams, trigrams, and signature patterns
        """
        if not post_texts:
            return {
                "typical_bigrams": [],
                "typical_trigrams": [],
                "signature_combinations": [],
                "available": False
            }
        try:
            import nltk
            from nltk import ngrams, word_tokenize

            # Download required NLTK data if not available
            try:
                nltk.data.find('tokenizers/punkt')
            except LookupError:
                logger.info("Downloading NLTK punkt tokenizer...")
                nltk.download('punkt', quiet=True)

            # Combine all text
            all_text = " ".join(post_texts)

            # Tokenize (try the German tokenizer, fall back to the default)
            try:
                words = word_tokenize(all_text.lower(), language='german')
            except Exception:
                words = word_tokenize(all_text.lower())

            # Remove punctuation and very short words
            words = [w for w in words if w.isalnum() and len(w) > 2]
            # Extract bigrams (2-word combinations)
            bigrams = list(ngrams(words, 2))
            bigram_freq = Counter(bigrams)

            # Extract trigrams (3-word combinations)
            trigrams = list(ngrams(words, 3))
            trigram_freq = Counter(trigrams)

            # Get top bigrams and trigrams
            top_bigrams = [" ".join(bg) for bg, _ in bigram_freq.most_common(50)]
            top_trigrams = [" ".join(tg) for tg, _ in trigram_freq.most_common(30)]

            # Find signature patterns (unique combinations that appear multiple times)
            signature_patterns = self._find_signature_patterns(bigram_freq, trigram_freq)

            logger.info(
                f"Extracted {len(top_bigrams)} bigrams, {len(top_trigrams)} trigrams, "
                f"{len(signature_patterns)} signature patterns"
            )

            return {
                "typical_bigrams": top_bigrams,
                "typical_trigrams": top_trigrams,
                "signature_combinations": signature_patterns,
                "bigram_count": len(bigrams),
                "trigram_count": len(trigrams),
                "available": True
            }

        except Exception as e:
            logger.error(f"N-gram extraction failed: {e}")
            return {
                "typical_bigrams": [],
                "typical_trigrams": [],
                "signature_combinations": [],
                "available": False,
                "error": str(e)
            }
    def _find_signature_patterns(
        self,
        bigram_freq: Counter,
        trigram_freq: Counter
    ) -> List[str]:
        """
        Find signature patterns - unique combinations that are not generic.

        Args:
            bigram_freq: Counter of bigram frequencies
            trigram_freq: Counter of trigram frequencies

        Returns:
            List of signature pattern strings
        """
        # Generic German words to filter out
        generic_words = {
            'und', 'die', 'der', 'das', 'ist', 'sind', 'ein', 'eine',
            'zu', 'den', 'für', 'mit', 'auf', 'von', 'in', 'des',
            'dem', 'im', 'zum', 'zur', 'am', 'bei', 'hat', 'haben',
            'wird', 'werden', 'kann', 'können', 'soll', 'müssen',
            'auch', 'nur', 'noch', 'schon', 'sehr', 'mehr', 'aber',
            'oder', 'wenn', 'dann', 'als', 'wie', 'nach', 'über'
        }

        signature_patterns = []

        # Find non-generic bigrams that appear at least 3 times
        for (w1, w2), count in bigram_freq.most_common(100):
            if count >= 3:  # Must appear multiple times
                # Skip combinations where both words are generic
                if w1 not in generic_words or w2 not in generic_words:
                    pattern = f"{w1} {w2}"
                    if pattern not in signature_patterns:
                        signature_patterns.append(pattern)

        # Find non-generic trigrams that appear at least 2 times
        for (w1, w2, w3), count in trigram_freq.most_common(50):
            if count >= 2:
                # At least two of the three words must not be generic
                non_generic_count = sum(1 for w in [w1, w2, w3] if w not in generic_words)
                if non_generic_count >= 2:
                    pattern = f"{w1} {w2} {w3}"
                    if pattern not in signature_patterns:
                        signature_patterns.append(pattern)

        # Limit to top 20 most distinctive patterns
        return signature_patterns[:20]
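
For readers who want to exercise the counting step in isolation: below is a minimal standalone sketch of the same pipeline the new _extract_ngram_patterns method implements (tokenize, drop short tokens, count bigrams and trigrams with Counter). The sample posts and the output shown in the comments are illustrative assumptions only and are not taken from the repository.

# Standalone sketch of the N-gram counting pipeline (hypothetical sample data,
# not part of this commit). Requires: pip install nltk
from collections import Counter

import nltk
from nltk import ngrams, word_tokenize

nltk.download('punkt', quiet=True)  # tokenizer data; fetched on first run

sample_posts = [  # hypothetical post texts, for demonstration only
    "Mehr Sichtbarkeit durch klare Positionierung auf LinkedIn.",
    "Klare Positionierung schlägt lautes Marketing - jeden Tag.",
]

all_text = " ".join(sample_posts)
try:
    words = word_tokenize(all_text.lower(), language='german')
except LookupError:
    words = word_tokenize(all_text.lower())

# Same filtering as in the commit: keep alphanumeric tokens longer than 2 characters
words = [w for w in words if w.isalnum() and len(w) > 2]

bigram_freq = Counter(ngrams(words, 2))
trigram_freq = Counter(ngrams(words, 3))

print(bigram_freq.most_common(3))   # e.g. [(('klare', 'positionierung'), 2), ...]
print(trigram_freq.most_common(3))

In the agent itself, these raw counts are then passed to _find_signature_patterns, which drops combinations made up only of generic German function words before the result is attached to the profile analysis under the "ngram_patterns" key.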