Onyva-Postling/scripts/backfill_usage_logs.py
Ruben Fischer f14515e9cf Major updates: LinkedIn auto-posting, timezone fixes, and Docker improvements
Features:
- Add LinkedIn OAuth integration and auto-posting functionality
- Add scheduler service for automated post publishing
- Add metadata field to generated_posts for LinkedIn URLs
- Add privacy policy page for LinkedIn API compliance
- Add company management features and employee accounts
- Add license key system for company registrations

Fixes:
- Fix timezone issues (use UTC consistently across app)
- Fix datetime serialization errors in database operations
- Fix scheduling timezone conversion (local time to UTC)
- Fix import errors (get_database -> db)

Infrastructure:
- Update Docker setup to use port 8001 (avoid conflicts)
- Add SSL support with nginx-proxy and Let's Encrypt
- Add LinkedIn setup documentation
- Add migration scripts for schema updates

Services:
- Add linkedin_service.py for LinkedIn API integration
- Add scheduler_service.py for background job processing
- Add storage_service.py for Supabase Storage
- Improve email_service.py
- Add encryption utilities for token storage

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-11 11:30:20 +01:00

146 lines · 5.2 KiB · Python

"""Backfill api_usage_logs for all generated posts not yet tracked.
Assumes per post:
- 1x gpt-4o call: ~20,000 tokens (14,000 prompt + 6,000 completion)
- 1x gpt-4o-mini call: ~17,000 tokens (13,000 prompt + 4,000 completion)
Only processes posts older than 20 minutes whose created_at is not already
covered by an existing api_usage_log entry for the same customer.
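
Run with `python scripts/backfill_usage_logs.py` from any working directory;
the script inserts the project root into sys.path itself.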
"""

import asyncio
import sys
import os
from datetime import datetime, timedelta, timezone

# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from src.config import estimate_cost
from src.database.client import db

# ── Estimated token splits per post ──────────────────────────────
GPT4O_PROMPT = 14_000
GPT4O_COMP = 6_000
GPT4O_TOTAL = GPT4O_PROMPT + GPT4O_COMP  # 20,000

MINI_PROMPT = 13_000
MINI_COMP = 4_000
MINI_TOTAL = MINI_PROMPT + MINI_COMP  # 17,000
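
# estimate_cost(model, prompt_tokens, completion_tokens) is expected to return
# the cost in USD as a float, based on the per-model pricing in src.config.
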
async def main():
    cutoff = datetime.now(timezone.utc) - timedelta(minutes=20)
    print(f"Cutoff: posts created before {cutoff.isoformat()}")

    # ── 1. Load all generated posts ──────────────────────────────
    customers = await db.list_customers()
    print(f"Found {len(customers)} customers")

    all_posts = []
    for cust in customers:
        posts = await db.get_generated_posts(cust.id)
        all_posts.extend(posts)
    print(f"Found {len(all_posts)} total generated posts")
    # Filter to posts older than 20 min
    eligible = [
        p for p in all_posts
        if p.created_at and p.created_at.replace(tzinfo=timezone.utc) < cutoff
    ]
    print(f"{len(eligible)} posts older than 20 min")

    # ── 2. Load existing logs to avoid duplicates ────────────────
    try:
        existing_logs = await asyncio.to_thread(
            lambda: db.client.table("api_usage_logs")
            .select("customer_id, created_at")
            .eq("operation", "post_creation_backfill")
            .execute()
        )
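        # Dedup key truncates timestamps to second precision
        # ("YYYY-MM-DDTHH:MM:SS"), matching the 19-char slice used for posts below.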
        already_logged = set()
        for log in existing_logs.data:
            # `or ""` guards against a null created_at in the returned row
            key = (log.get("customer_id"), (log.get("created_at") or "")[:19])
            already_logged.add(key)
        print(f"{len(already_logged)} existing backfill entries found")
    except Exception as e:
        print(f"Could not read existing logs (table may be new): {e}")
        already_logged = set()

    # ── 3. Build customer → user_id / company_id map ─────────────
    cust_map = {}
    for cust in customers:
        cust_map[str(cust.id)] = {
            "user_id": str(cust.user_id) if cust.user_id else None,
            "company_id": str(cust.company_id) if cust.company_id else None,
        }

    # ── 4. Insert two log rows per post ──────────────────────────
    inserted = 0
    skipped = 0
    for post in eligible:
        cid = str(post.customer_id)
        ts = post.created_at.isoformat()[:19] if post.created_at else ""
        key = (cid, ts)
        if key in already_logged:
            skipped += 1
            continue

        ids = cust_map.get(cid, {})
        user_id = ids.get("user_id")
        company_id = ids.get("company_id")

        base = {
            "customer_id": cid,
            "operation": "post_creation_backfill",
            "created_at": post.created_at.isoformat() if post.created_at else None,
        }
        if user_id:
            base["user_id"] = user_id
        if company_id:
            base["company_id"] = company_id

        # Row 1: gpt-4o
        gpt4o_cost = estimate_cost("gpt-4o", GPT4O_PROMPT, GPT4O_COMP)
        row_4o = {
            **base,
            "provider": "openai",
            "model": "gpt-4o",
            "prompt_tokens": GPT4O_PROMPT,
            "completion_tokens": GPT4O_COMP,
            "total_tokens": GPT4O_TOTAL,
            "estimated_cost_usd": round(gpt4o_cost, 6),
        }

        # Row 2: gpt-4o-mini
        mini_cost = estimate_cost("gpt-4o-mini", MINI_PROMPT, MINI_COMP)
        row_mini = {
            **base,
            "provider": "openai",
            "model": "gpt-4o-mini",
            "prompt_tokens": MINI_PROMPT,
            "completion_tokens": MINI_COMP,
            "total_tokens": MINI_TOTAL,
            "estimated_cost_usd": round(mini_cost, 6),
        }
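
        # Rows are bound as lambda defaults so the thread sees this iteration's
        # values rather than late-bound loop variables.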
        try:
            await asyncio.to_thread(
                lambda r1=row_4o, r2=row_mini: db.client.table("api_usage_logs")
                .insert([r1, r2]).execute()
            )
            inserted += 2
            name = post.topic_title[:40] if post.topic_title else "?"
            print(f" + {name} (gpt-4o ${gpt4o_cost:.4f} + mini ${mini_cost:.4f})")
        except Exception as e:
            print(f" ! Error for post {post.id}: {e}")

    print(f"\nDone: {inserted} log rows inserted, {skipped} posts skipped (already backfilled)")
    print(
        f"Estimated totals per post: "
        f"gpt-4o ${estimate_cost('gpt-4o', GPT4O_PROMPT, GPT4O_COMP):.4f} + "
        f"mini ${estimate_cost('gpt-4o-mini', MINI_PROMPT, MINI_COMP):.4f}"
    )

if __name__ == "__main__":
    asyncio.run(main())