diff --git a/README.md b/README.md index 89ace00..82eca44 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,12 @@ Open `http://localhost:5000`. - **Resilient Model Wrapper:** Wraps every Gemini API call with up to 3 retries and exponential backoff, handles quota errors and rate limits, and can switch to an alternative model mid-stream. - **Auto Model Selection:** On startup, a bootstrapper model queries the Gemini API and selects the optimal models for Logic, Writer, Artist, and Image roles. Selection is cached for 24 hours. - **Vertex AI Support:** If `GCP_PROJECT` is set and OAuth credentials are present, initializes Vertex AI automatically for Imagen image generation. +- **Payload Guardrails:** Every generation call estimates the prompt token count before dispatch. If the payload exceeds 30,000 tokens, a warning is logged so runaway context injection is surfaced immediately. + +### AI Context Optimization (`core/utils.py`) +- **Token Estimation:** `estimate_tokens(text)` provides a fast character-based token count approximation (`len(text) / 4`) without requiring external tokenizer libraries. +- **Context Truncation:** `truncate_to_tokens(text, max_tokens)` enforces hard caps on large context variables — previous chapter text, story summaries, and character data — before they are injected into prompts, preventing token overflows on large manuscripts. +- **AI Response Cache:** An in-memory cache (`_AI_CACHE`) keyed by MD5 hash of inputs prevents redundant API calls for deterministic tasks such as persona analysis. Results are reused for identical inputs within the same session. ### Cost Tracking Every AI call logs input/output token counts and estimated USD cost (using cached pricing per model). Cumulative project cost is stored in the database and displayed per user and per run. 
diff --git a/ai/models.py b/ai/models.py index 0449728..6018f57 100644 --- a/ai/models.py +++ b/ai/models.py @@ -45,7 +45,21 @@ class ResilientModel: self.name = name self.model = genai.GenerativeModel(name, safety_settings=self.safety_settings) + _TOKEN_WARN_LIMIT = 30_000 + def generate_content(self, *args, **kwargs): + # Estimate payload size and warn if it exceeds the safe limit + if args: + payload = args[0] + if isinstance(payload, str): + est = utils.estimate_tokens(payload) + elif isinstance(payload, list): + est = sum(utils.estimate_tokens(p) if isinstance(p, str) else 0 for p in payload) + else: + est = 0 + if est > self._TOKEN_WARN_LIMIT: + utils.log("SYSTEM", f"⚠️ Payload warning: ~{est:,} tokens for {self.role} ({self.name}). Consider reducing context.") + retries = 0 max_retries = 3 base_delay = 5 diff --git a/core/utils.py b/core/utils.py index 3a55134..e213a9c 100644 --- a/core/utils.py +++ b/core/utils.py @@ -2,6 +2,7 @@ import os import json import datetime import time +import hashlib from core import config import threading import re @@ -19,6 +20,40 @@ _log_context = threading.local() # Cache for dynamic pricing from AI model selection PRICING_CACHE = {} +# --- Token Estimation & Truncation Utilities --- + +def estimate_tokens(text): + """Estimate token count using a 4-chars-per-token heuristic (no external libs required).""" + if not text: + return 0 + return max(1, len(text) // 4) + +def truncate_to_tokens(text, max_tokens): + """Truncate text to approximately max_tokens, keeping the most recent (tail) content.""" + if not text: + return text + max_chars = max_tokens * 4 + if len(text) <= max_chars: + return text + return text[-max_chars:] + +# --- In-Memory AI Response Cache --- + +_AI_CACHE = {} + +def get_ai_cache(key): + """Retrieve a cached AI response by key. 
Returns None if not cached.""" + return _AI_CACHE.get(key) + +def set_ai_cache(key, value): + """Store an AI response in the in-memory cache keyed by a hash string.""" + _AI_CACHE[key] = value + +def make_cache_key(prefix, *parts): + """Build a stable MD5 cache key from a prefix and variable string parts.""" + raw = "|".join(str(p) for p in parts) + return f"{prefix}:{hashlib.md5(raw.encode('utf-8', errors='replace')).hexdigest()}" + def set_log_file(filepath): _log_context.log_file = filepath diff --git a/story/editor.py b/story/editor.py index 19bfdfa..a51e915 100644 --- a/story/editor.py +++ b/story/editor.py @@ -59,7 +59,7 @@ def evaluate_chapter_quality(text, chapter_title, genre, model, folder): }} """ try: - response = model.generate_content([prompt, text[:30000]]) + response = model.generate_content([prompt, utils.truncate_to_tokens(text, 7500)]) model_name = getattr(model, 'name', ai_models.logic_model_name) utils.log_usage(folder, model_name, response.usage_metadata) data = json.loads(utils.clean_json(response.text)) @@ -86,8 +86,8 @@ def check_pacing(bp, summary, last_chapter_text, last_chapter_data, remaining_ch TASK: Analyze pacing. CONTEXT: - - PREVIOUS_SUMMARY: {summary[-3000:]} - - CURRENT_CHAPTER: {last_chapter_text[-2000:]} + - PREVIOUS_SUMMARY: {utils.truncate_to_tokens(summary, 1000)} + - CURRENT_CHAPTER: {utils.truncate_to_tokens(last_chapter_text, 800)} - UPCOMING: {json.dumps([c['title'] for c in remaining_chapters[:3]])} - REMAINING_COUNT: {len(remaining_chapters)} @@ -254,7 +254,7 @@ def check_and_propagate(bp, manuscript, changed_chap_num, folder, change_summary TASK: Summarize the key events and ending state of this chapter for continuity tracking. TEXT: - {changed_chap.get('content', '')[:10000]} + {utils.truncate_to_tokens(changed_chap.get('content', ''), 2500)} FOCUS: - Major plot points. 
@@ -350,7 +350,7 @@ def check_and_propagate(bp, manuscript, changed_chap_num, folder, change_summary CHANGE_SUMMARY: {current_context} CHAPTER_TO_CHECK (Ch {target_chap['num']}): - {target_chap['content'][:12000]} + {utils.truncate_to_tokens(target_chap['content'], 3000)} DECISION_LOGIC: - If the chapter directly contradicts the change (references dead characters, items that no longer exist, events that didn't happen), status = REWRITE. diff --git a/story/writer.py b/story/writer.py index c143c89..4145d3a 100644 --- a/story/writer.py +++ b/story/writer.py @@ -71,7 +71,7 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, prev_context_block = "" if prev_content: - trunc_content = prev_content[-3000:] if len(prev_content) > 3000 else prev_content + trunc_content = utils.truncate_to_tokens(prev_content, 2000) prev_context_block = f"\nPREVIOUS CHAPTER TEXT (For Tone & Continuity):\n{trunc_content}\n" chars_for_writer = [ @@ -238,7 +238,7 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, HARD_CONSTRAINTS: - TARGET_WORDS: ~{est_words} words (aim for this; ±20% is acceptable if the scene genuinely demands it — but do not condense beats to save space) - BEATS MUST BE COVERED: {json.dumps(chap.get('beats', []))} - - SUMMARY CONTEXT: {prev_sum[:1500]} + - SUMMARY CONTEXT: {utils.truncate_to_tokens(prev_sum, 600)} AUTHOR_VOICE: {persona_info} diff --git a/web/routes/persona.py b/web/routes/persona.py index dc3e5dc..20c5353 100644 --- a/web/routes/persona.py +++ b/web/routes/persona.py @@ -112,6 +112,19 @@ def analyze_persona(): data = request.json sample = data.get('sample_text', '') + # Cache by a hash of the inputs to avoid redundant API calls for unchanged data + cache_key = utils.make_cache_key( + "persona_analyze", + data.get('name', ''), + data.get('age', ''), + data.get('gender', ''), + data.get('nationality', ''), + sample + ) + cached = utils.get_ai_cache(cache_key) + if cached is not None: + return cached + 
prompt = f""" ROLE: Literary Analyst TASK: Create or analyze an Author Persona profile. @@ -119,7 +132,7 @@ def analyze_persona(): INPUT_DATA: - NAME: {data.get('name')} - DEMOGRAPHICS: Age: {data.get('age')} | Gender: {data.get('gender')} | Nationality: {data.get('nationality')} - - SAMPLE_TEXT: {sample[:3000]} + - SAMPLE_TEXT: {utils.truncate_to_tokens(sample, 750)} INSTRUCTIONS: 1. BIO: Write a 2-3 sentence description of the writing style. If sample is provided, analyze it. If not, invent a style that fits the demographics/name. @@ -130,6 +143,8 @@ def analyze_persona(): """ try: response = ai_models.model_logic.generate_content(prompt) - return json.loads(utils.clean_json(response.text)) + result = json.loads(utils.clean_json(response.text)) + utils.set_ai_cache(cache_key, result) + return result except Exception as e: return {"error": str(e)}, 500