new quality save layer
@@ -1,6 +1,7 @@
 """Profile analyzer agent."""
 import json
 from typing import Dict, Any, List
+from collections import Counter
 from loguru import logger
 
 from src.agents.base import BaseAgent
@@ -61,7 +62,13 @@ class ProfileAnalyzerAgent(BaseAgent):
 
             # Parse JSON response
             analysis = json.loads(response)
-            logger.info("Profile analysis completed successfully")
+
+            # Add N-gram analysis
+            post_texts = [p.post_text for p in posts if p.post_text]
+            ngram_patterns = self._extract_ngram_patterns(post_texts)
+            analysis["ngram_patterns"] = ngram_patterns
+
+            logger.info("Profile analysis completed successfully (with N-gram analysis)")
 
             return analysis
 
@@ -298,3 +305,130 @@ Erstelle eine umfassende Analyse im folgenden JSON-Format:
 }}
 
 KRITISCH: Bei phrase_library und structure_templates müssen ECHTE, WÖRTLICHE Beispiele aus den Posts stehen! Keine generischen Beschreibungen!"""
+
+    def _extract_ngram_patterns(self, post_texts: List[str]) -> Dict[str, Any]:
+        """
+        Extract N-gram patterns from posts for better style fingerprinting.
+
+        Args:
+            post_texts: List of post texts
+
+        Returns:
+            Dictionary with bigrams, trigrams, and signature patterns
+        """
+        if not post_texts:
+            return {
+                "typical_bigrams": [],
+                "typical_trigrams": [],
+                "signature_combinations": [],
+                "available": False
+            }
+
+        try:
+            import nltk
+            from nltk import ngrams, word_tokenize
+
+            # Download the required NLTK data if it is not available
+            try:
+                nltk.data.find('tokenizers/punkt')
+            except LookupError:
+                logger.info("Downloading NLTK punkt tokenizer...")
+                nltk.download('punkt', quiet=True)
+
+            # Combine all text
+            all_text = " ".join(post_texts)
+
+            # Tokenize (try the German tokenizer, fall back to the default)
+            try:
+                words = word_tokenize(all_text.lower(), language='german')
+            except LookupError:
+                words = word_tokenize(all_text.lower())
+
+            # Remove punctuation and very short words
+            words = [w for w in words if w.isalnum() and len(w) > 2]
+
+            # Extract bigrams (2-word combinations)
+            bigrams = list(ngrams(words, 2))
+            bigram_freq = Counter(bigrams)
+
+            # Extract trigrams (3-word combinations)
+            trigrams = list(ngrams(words, 3))
+            trigram_freq = Counter(trigrams)
+
+            # Get the top bigrams and trigrams
+            top_bigrams = [" ".join(bg) for bg, _ in bigram_freq.most_common(50)]
+            top_trigrams = [" ".join(tg) for tg, _ in trigram_freq.most_common(30)]
+
+            # Find signature patterns (distinctive combinations that appear multiple times)
+            signature_patterns = self._find_signature_patterns(bigram_freq, trigram_freq)
+
+            logger.info(f"Extracted {len(top_bigrams)} bigrams, {len(top_trigrams)} trigrams, "
+                        f"{len(signature_patterns)} signature patterns")
+
+            return {
+                "typical_bigrams": top_bigrams,
+                "typical_trigrams": top_trigrams,
+                "signature_combinations": signature_patterns,
+                "bigram_count": len(bigrams),
+                "trigram_count": len(trigrams),
+                "available": True
+            }
+
+        except Exception as e:
+            logger.error(f"N-gram extraction failed: {e}")
+            return {
+                "typical_bigrams": [],
+                "typical_trigrams": [],
+                "signature_combinations": [],
+                "available": False,
+                "error": str(e)
+            }
+
+    def _find_signature_patterns(
+        self,
+        bigram_freq: Counter,
+        trigram_freq: Counter
+    ) -> List[str]:
+        """
+        Find signature patterns - distinctive combinations that are not generic.
+
+        Args:
+            bigram_freq: Counter of bigram frequencies
+            trigram_freq: Counter of trigram frequencies
+
+        Returns:
+            List of signature pattern strings
+        """
+        # Generic German words to filter out
+        generic_words = {
+            'und', 'die', 'der', 'das', 'ist', 'sind', 'ein', 'eine',
+            'zu', 'den', 'für', 'mit', 'auf', 'von', 'in', 'des',
+            'dem', 'im', 'zum', 'zur', 'am', 'bei', 'hat', 'haben',
+            'wird', 'werden', 'kann', 'können', 'soll', 'müssen',
+            'auch', 'nur', 'noch', 'schon', 'sehr', 'mehr', 'aber',
+            'oder', 'wenn', 'dann', 'als', 'wie', 'nach', 'über'
+        }
+
+        signature_patterns = []
+
+        # Keep bigrams that appear at least 3 times and in which
+        # at least one word is non-generic
+        for (w1, w2), count in bigram_freq.most_common(100):
+            if count >= 3:
+                if w1 not in generic_words or w2 not in generic_words:
+                    pattern = f"{w1} {w2}"
+                    if pattern not in signature_patterns:
+                        signature_patterns.append(pattern)
+
+        # Keep trigrams that appear at least 2 times and in which
+        # at least two of the three words are non-generic
+        for (w1, w2, w3), count in trigram_freq.most_common(50):
+            if count >= 2:
+                non_generic_count = sum(1 for w in [w1, w2, w3] if w not in generic_words)
+                if non_generic_count >= 2:
+                    pattern = f"{w1} {w2} {w3}"
+                    if pattern not in signature_patterns:
+                        signature_patterns.append(pattern)
+
+        # Return at most the first 20 patterns (ordered by frequency)
+        return signature_patterns[:20]
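
To see what the new layer extracts, here is a minimal standalone sketch of the same bigram counting, assuming nltk is installed. The sample posts are invented for illustration; in the agent this logic runs inside `_extract_ngram_patterns` on real post texts, and note that newer NLTK releases may expect the 'punkt_tab' resource instead of 'punkt'.

```python
from collections import Counter

import nltk
from nltk import ngrams, word_tokenize

# Invented sample posts, for illustration only.
posts = [
    "Heute teile ich drei Learnings aus meinem letzten Projekt.",
    "Drei Learnings aus zehn Jahren Vertrieb: Zuhören schlägt Reden.",
    "Mein letztes Projekt hat mir wieder drei Learnings beschert.",
]

# Mirror the agent's lazy download of the punkt tokenizer.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

# Tokenize, lowercase, and drop punctuation and very short words,
# as in _extract_ngram_patterns.
words = word_tokenize(" ".join(posts).lower(), language='german')
words = [w for w in words if w.isalnum() and len(w) > 2]

# Count bigrams; recurring ones like "drei learnings" are the
# candidates that _find_signature_patterns would keep.
bigram_freq = Counter(ngrams(words, 2))
for (w1, w2), count in bigram_freq.most_common(5):
    print(f"{w1} {w2}: {count}")
```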