Added scalability and performance improvements (Redis, HTTP caching, etc.)

2026-02-19 17:19:41 +01:00
parent d8d054c9a8
commit 4b15b552d6
12 changed files with 763 additions and 88 deletions

@@ -32,10 +32,11 @@ from src.services.email_service import (
     mark_token_used,
 )
 from src.services.background_jobs import (
-    job_manager, JobType, JobStatus,
+    JobType, JobStatus,
     run_post_scraping, run_profile_analysis, run_post_categorization, run_post_type_analysis,
     run_full_analysis_pipeline, run_post_recategorization
 )
+from src.services.db_job_manager import job_manager
 from src.services.storage_service import storage
 
 # Router for user frontend
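
Every create_job call below gains an await because the process-local job manager is replaced by a database-backed one, so job state survives restarts and is visible to every worker. A minimal sketch of the idea, assuming SQLAlchemy's async API; the names, schema, and engine URL are illustrative, since the real src/services/db_job_manager.py is not shown in this excerpt:

# Sketch only: names, schema, and engine URL are assumptions, not this repo's code.
# Requires sqlalchemy>=2.0 and aiosqlite for the example URL.
import asyncio
import enum
import uuid

from sqlalchemy import Enum, Integer, String, select
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

class JobStatus(enum.Enum):
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"

class Base(DeclarativeBase):
    pass

class Job(Base):
    __tablename__ = "jobs"
    id: Mapped[str] = mapped_column(String, primary_key=True)
    job_type: Mapped[str] = mapped_column(String)
    status: Mapped[JobStatus] = mapped_column(Enum(JobStatus), default=JobStatus.PENDING)
    user_id: Mapped[str] = mapped_column(String, index=True)
    progress: Mapped[int] = mapped_column(Integer, default=0)

engine = create_async_engine("sqlite+aiosqlite:///jobs.db")  # assumed URL
Session = async_sessionmaker(engine, expire_on_commit=False)

class DBJobManager:
    """Persists jobs so any worker process can read and update them."""

    async def create_job(self, job_type: str, user_id: str) -> Job:
        job = Job(id=str(uuid.uuid4()), job_type=job_type, user_id=user_id)
        async with Session() as db:
            db.add(job)
            await db.commit()
        return job

    async def get_active_jobs(self, tracking_id: str) -> list[Job]:
        async with Session() as db:
            rows = await db.execute(
                select(Job).where(
                    Job.user_id == tracking_id,
                    Job.status.in_([JobStatus.PENDING, JobStatus.RUNNING]),
                )
            )
            return list(rows.scalars())

async def demo() -> None:
    # Create the table, then exercise both methods once.
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
    mgr = DBJobManager()
    job = await mgr.create_job("post_scraping", "user-123")
    print(job.id, [j.id for j in await mgr.get_active_jobs("user-123")])

asyncio.run(demo())

With jobs in a shared table, any worker can answer get_active_jobs for the SSE endpoint further down.
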
@@ -93,6 +94,7 @@ async def get_user_avatar(session: UserSession, user_id: UUID) -> Optional[str]:
         return None
 
+
 def require_user_session(request: Request) -> Optional[UserSession]:
     """Check if user is authenticated, redirect to login if not."""
     session = get_user_session(request)
@@ -676,7 +678,7 @@ async def onboarding_profile_submit(
logger.info(f"Skipping scraping - {len(existing_posts)} posts already exist for user {user_id}")
if should_scrape:
job = job_manager.create_job(JobType.POST_SCRAPING, str(user_id))
job = await job_manager.create_job(JobType.POST_SCRAPING, str(user_id))
background_tasks.add_task(run_post_scraping, user_id, linkedin_url, job.id)
logger.info(f"Started background scraping for user {user_id}")
@@ -829,7 +831,7 @@ async def api_rescrape(request: Request, background_tasks: BackgroundTasks):
         return JSONResponse({"error": "No LinkedIn URL found"}, status_code=400)
 
     # Create job and start scraping
-    job = job_manager.create_job(JobType.POST_SCRAPING, session.user_id)
+    job = await job_manager.create_job(JobType.POST_SCRAPING, session.user_id)
     background_tasks.add_task(run_post_scraping, user_id, profile.linkedin_url, job.id)
 
     return JSONResponse({"success": True, "job_id": job.id})
@@ -1451,53 +1453,45 @@ async def api_categorize_post(request: Request):
 @user_router.get("/api/job-updates")
 async def job_updates_sse(request: Request):
-    """Server-Sent Events endpoint for job updates."""
+    """Server-Sent Events endpoint for job updates (Redis pub/sub; works across workers)."""
     session = require_user_session(request)
     tracking_id = getattr(session, 'user_id', None) or getattr(session, 'company_id', None)
     if not session or not tracking_id:
         return JSONResponse({"error": "Not authenticated"}, status_code=401)
 
     async def event_generator():
-        queue = asyncio.Queue()
-
-        async def on_job_update(job):
-            await queue.put(job)
-
-        # Register listener
-        job_manager.add_listener(tracking_id, on_job_update)
+        from src.services.redis_client import get_redis
+        r = await get_redis()
+        pubsub = r.pubsub()
+        await pubsub.subscribe(f"job_updates:{tracking_id}")
 
         try:
-            # Send initial active jobs
-            active_jobs = job_manager.get_active_jobs(tracking_id)
-            for job in active_jobs:
+            # Send any currently active jobs as the initial state
+            for job in await job_manager.get_active_jobs(tracking_id):
                 data = {
                     "id": job.id,
                     "job_type": job.job_type.value,
                     "status": job.status.value,
                     "progress": job.progress,
                     "message": job.message,
-                    "error": job.error
+                    "error": job.error,
                 }
                 yield f"data: {json.dumps(data)}\n\n"
 
-            # Stream updates
+            # Stream pub/sub messages, keepalive on timeout
             while True:
                 try:
-                    job = await asyncio.wait_for(queue.get(), timeout=30)
-                    data = {
-                        "id": job.id,
-                        "job_type": job.job_type.value,
-                        "status": job.status.value,
-                        "progress": job.progress,
-                        "message": job.message,
-                        "error": job.error
-                    }
-                    yield f"data: {json.dumps(data)}\n\n"
+                    msg = await asyncio.wait_for(
+                        pubsub.get_message(ignore_subscribe_messages=True), timeout=30
+                    )
+                    if msg and msg.get("type") == "message":
+                        yield f"data: {msg['data']}\n\n"
+                    else:
+                        yield ": keepalive\n\n"
                 except asyncio.TimeoutError:
-                    # Send keepalive
                     yield ": keepalive\n\n"
         finally:
-            job_manager.remove_listener(tracking_id, on_job_update)
+            await pubsub.unsubscribe(f"job_updates:{tracking_id}")
+            await pubsub.aclose()
 
     return StreamingResponse(
         event_generator(),
@@ -1505,8 +1499,8 @@ async def job_updates_sse(request: Request):
         headers={
             "Cache-Control": "no-cache",
             "Connection": "keep-alive",
-            "X-Accel-Buffering": "no"
-        }
+            "X-Accel-Buffering": "no",
+        },
     )
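
The generator above only relays whatever arrives on the job_updates:{tracking_id} channel; the producing side lives wherever job state changes. A hypothetical publisher, mirroring the JSON shape the endpoint builds for its initial snapshot (the helper name and the job argument's type are assumptions, not shown in this commit):

# Hypothetical publisher side; assumed to live next to the DB job manager.
# The payload mirrors the dict the SSE generator sends as its initial
# snapshot, so the endpoint can relay msg["data"] verbatim.
import json
from typing import Any

from redis.asyncio import Redis

async def publish_job_update(r: Redis, tracking_id: str, job: Any) -> None:
    payload = {
        "id": job.id,
        "job_type": job.job_type.value,
        "status": job.status.value,
        "progress": job.progress,
        "message": job.message,
        "error": job.error,
    }
    # One channel per user/company id; the SSE endpoint subscribes to the same name.
    await r.publish(f"job_updates:{tracking_id}", json.dumps(payload))

Since the endpoint interpolates msg['data'] straight into the SSE frame, get_redis presumably creates the client with decode_responses=True so the payload arrives as str rather than bytes.
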
@@ -1521,7 +1515,7 @@ async def api_run_post_type_analysis(request: Request, background_tasks: Backgro
     user_id = UUID(session.user_id)
 
     # Create job
-    job = job_manager.create_job(JobType.POST_TYPE_ANALYSIS, session.user_id)
+    job = await job_manager.create_job(JobType.POST_TYPE_ANALYSIS, session.user_id)
 
     # Run in background
     background_tasks.add_task(run_post_type_analysis, user_id, job.id)
@@ -3278,13 +3272,13 @@ async def save_all_and_reanalyze(request: Request, background_tasks: BackgroundT
     # Only trigger re-categorization and analysis if there were structural changes
     if has_structural_changes:
         # Create background job for post re-categorization (ALL posts)
-        categorization_job = job_manager.create_job(
+        categorization_job = await job_manager.create_job(
             job_type=JobType.POST_CATEGORIZATION,
             user_id=user_id_str
         )
 
         # Create background job for post type analysis
-        analysis_job = job_manager.create_job(
+        analysis_job = await job_manager.create_job(
             job_type=JobType.POST_TYPE_ANALYSIS,
             user_id=user_id_str
         )
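
End to end, a client calls one of the job-creating endpoints above and then follows progress over /api/job-updates. A hypothetical consumer, assuming httpx, a local base URL, and the session cookie obtained at login (none of which appear in this commit):

# Hypothetical SSE consumer; httpx, the base URL, and the cookie name are
# assumptions.
import asyncio
import json

import httpx

async def watch_jobs(session_cookie: str) -> None:
    cookies = {"session": session_cookie}  # assumed cookie name
    async with httpx.AsyncClient(cookies=cookies, timeout=None) as client:
        async with client.stream("GET", "http://localhost:8000/api/job-updates") as resp:
            async for line in resp.aiter_lines():
                # Lines starting with ":" are keepalive comments; skip them.
                if line.startswith("data: "):
                    update = json.loads(line[len("data: "):])
                    print(update["job_type"], update["status"], update["progress"])

asyncio.run(watch_jobs("your-session-cookie"))
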