Blueprint v1.0.4: Implemented AI Context Optimization & Token Management
- core/utils.py: Added estimate_tokens(), truncate_to_tokens(), get_ai_cache(), set_ai_cache(), make_cache_key() utilities - story/writer.py: Applied truncate_to_tokens() to prev_content (2000 tokens) and prev_sum (600 tokens) context injections - story/editor.py: Applied truncate_to_tokens() to summary (1000t), last_chapter_text (800t), eval text (7500t), propagation contexts (2500t/3000t) - web/routes/persona.py: Added MD5-keyed in-memory cache for persona analyze endpoint; truncated sample_text to 750 tokens - ai/models.py: Added pre-dispatch payload size estimation with 30k-token warning threshold Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -125,6 +125,12 @@ Open `http://localhost:5000`.
|
|||||||
- **Resilient Model Wrapper:** Wraps every Gemini API call with up to 3 retries and exponential backoff, handles quota errors and rate limits, and can switch to an alternative model mid-stream.
|
- **Resilient Model Wrapper:** Wraps every Gemini API call with up to 3 retries and exponential backoff, handles quota errors and rate limits, and can switch to an alternative model mid-stream.
|
||||||
- **Auto Model Selection:** On startup, a bootstrapper model queries the Gemini API and selects the optimal models for Logic, Writer, Artist, and Image roles. Selection is cached for 24 hours.
|
- **Auto Model Selection:** On startup, a bootstrapper model queries the Gemini API and selects the optimal models for Logic, Writer, Artist, and Image roles. Selection is cached for 24 hours.
|
||||||
- **Vertex AI Support:** If `GCP_PROJECT` is set and OAuth credentials are present, initializes Vertex AI automatically for Imagen image generation.
|
- **Vertex AI Support:** If `GCP_PROJECT` is set and OAuth credentials are present, initializes Vertex AI automatically for Imagen image generation.
|
||||||
|
- **Payload Guardrails:** Every generation call estimates the prompt token count before dispatch. If the payload exceeds 30,000 tokens, a warning is logged so runaway context injection is surfaced immediately.
|
||||||
|
|
||||||
|
### AI Context Optimization (`core/utils.py`)
|
||||||
|
- **Token Estimation:** `estimate_tokens(text)` provides a fast character-based token count approximation (`len(text) / 4`) without requiring external tokenizer libraries.
|
||||||
|
- **Context Truncation:** `truncate_to_tokens(text, max_tokens)` enforces hard caps on large context variables — previous chapter text, story summaries, and character data — before they are injected into prompts, preventing token overflows on large manuscripts.
|
||||||
|
- **AI Response Cache:** An in-memory cache (`_AI_CACHE`) keyed by MD5 hash of inputs prevents redundant API calls for deterministic tasks such as persona analysis. Results are reused for identical inputs within the same session.
|
||||||
|
|
||||||
### Cost Tracking
|
### Cost Tracking
|
||||||
Every AI call logs input/output token counts and estimated USD cost (using cached pricing per model). Cumulative project cost is stored in the database and displayed per user and per run.
|
Every AI call logs input/output token counts and estimated USD cost (using cached pricing per model). Cumulative project cost is stored in the database and displayed per user and per run.
|
||||||
|
|||||||
14
ai/models.py
14
ai/models.py
@@ -45,7 +45,21 @@ class ResilientModel:
|
|||||||
self.name = name
|
self.name = name
|
||||||
self.model = genai.GenerativeModel(name, safety_settings=self.safety_settings)
|
self.model = genai.GenerativeModel(name, safety_settings=self.safety_settings)
|
||||||
|
|
||||||
|
_TOKEN_WARN_LIMIT = 30_000
|
||||||
|
|
||||||
def generate_content(self, *args, **kwargs):
|
def generate_content(self, *args, **kwargs):
|
||||||
|
# Estimate payload size and warn if it exceeds the safe limit
|
||||||
|
if args:
|
||||||
|
payload = args[0]
|
||||||
|
if isinstance(payload, str):
|
||||||
|
est = utils.estimate_tokens(payload)
|
||||||
|
elif isinstance(payload, list):
|
||||||
|
est = sum(utils.estimate_tokens(p) if isinstance(p, str) else 0 for p in payload)
|
||||||
|
else:
|
||||||
|
est = 0
|
||||||
|
if est > self._TOKEN_WARN_LIMIT:
|
||||||
|
utils.log("SYSTEM", f"⚠️ Payload warning: ~{est:,} tokens for {self.role} ({self.name}). Consider reducing context.")
|
||||||
|
|
||||||
retries = 0
|
retries = 0
|
||||||
max_retries = 3
|
max_retries = 3
|
||||||
base_delay = 5
|
base_delay = 5
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import os
|
|||||||
import json
|
import json
|
||||||
import datetime
|
import datetime
|
||||||
import time
|
import time
|
||||||
|
import hashlib
|
||||||
from core import config
|
from core import config
|
||||||
import threading
|
import threading
|
||||||
import re
|
import re
|
||||||
@@ -19,6 +20,40 @@ _log_context = threading.local()
|
|||||||
# Cache for dynamic pricing from AI model selection
|
# Cache for dynamic pricing from AI model selection
|
||||||
PRICING_CACHE = {}
|
PRICING_CACHE = {}
|
||||||
|
|
||||||
|
# --- Token Estimation & Truncation Utilities ---
|
||||||
|
|
||||||
|
def estimate_tokens(text):
    """Approximate the token count of *text* using the ~4-chars-per-token heuristic.

    Keeps the dependency footprint at zero (no tokenizer library); any
    non-empty string counts as at least one token.
    """
    length = len(text) if text else 0
    return max(1, length // 4) if length else 0
|
||||||
|
|
||||||
|
def truncate_to_tokens(text, max_tokens):
    """Cap *text* at roughly *max_tokens*, preserving the tail (most recent content).

    Falsy input (None, "") is returned unchanged.
    """
    if not text:
        return text
    # 4 chars ≈ 1 token, mirroring the estimate_tokens() heuristic.
    char_budget = 4 * max_tokens
    return text if len(text) <= char_budget else text[-char_budget:]
|
||||||
|
|
||||||
|
# --- In-Memory AI Response Cache ---
|
||||||
|
|
||||||
|
# Session-scoped store of AI responses; lives only for the process lifetime.
_AI_CACHE = {}


def get_ai_cache(key):
    """Look up a previously stored AI response; returns None when *key* is absent."""
    return _AI_CACHE.get(key)


def set_ai_cache(key, value):
    """Remember *value* under *key* so identical requests can skip the API call."""
    _AI_CACHE[key] = value
|
||||||
|
|
||||||
|
def make_cache_key(prefix, *parts):
    """Build a stable MD5 cache key from a prefix and variable string parts.

    Each stringified part is length-prefixed before hashing so that parts
    which themselves contain the separator cannot collide — previously
    ("a|b",) and ("a", "b") produced the identical key.

    Args:
        prefix: Human-readable namespace for the key (e.g. "persona_analyze").
        *parts: Values (stringified) that uniquely identify the request.

    Returns:
        A string of the form "<prefix>:<md5 hexdigest>".
    """
    encoded = "|".join(f"{len(s)}:{s}" for s in (str(p) for p in parts))
    return f"{prefix}:{hashlib.md5(encoded.encode('utf-8', errors='replace')).hexdigest()}"
|
||||||
|
|
||||||
def set_log_file(filepath):
|
def set_log_file(filepath):
|
||||||
_log_context.log_file = filepath
|
_log_context.log_file = filepath
|
||||||
|
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ def evaluate_chapter_quality(text, chapter_title, genre, model, folder):
|
|||||||
}}
|
}}
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
response = model.generate_content([prompt, text[:30000]])
|
response = model.generate_content([prompt, utils.truncate_to_tokens(text, 7500)])
|
||||||
model_name = getattr(model, 'name', ai_models.logic_model_name)
|
model_name = getattr(model, 'name', ai_models.logic_model_name)
|
||||||
utils.log_usage(folder, model_name, response.usage_metadata)
|
utils.log_usage(folder, model_name, response.usage_metadata)
|
||||||
data = json.loads(utils.clean_json(response.text))
|
data = json.loads(utils.clean_json(response.text))
|
||||||
@@ -86,8 +86,8 @@ def check_pacing(bp, summary, last_chapter_text, last_chapter_data, remaining_ch
|
|||||||
TASK: Analyze pacing.
|
TASK: Analyze pacing.
|
||||||
|
|
||||||
CONTEXT:
|
CONTEXT:
|
||||||
- PREVIOUS_SUMMARY: {summary[-3000:]}
|
- PREVIOUS_SUMMARY: {utils.truncate_to_tokens(summary, 1000)}
|
||||||
- CURRENT_CHAPTER: {last_chapter_text[-2000:]}
|
- CURRENT_CHAPTER: {utils.truncate_to_tokens(last_chapter_text, 800)}
|
||||||
- UPCOMING: {json.dumps([c['title'] for c in remaining_chapters[:3]])}
|
- UPCOMING: {json.dumps([c['title'] for c in remaining_chapters[:3]])}
|
||||||
- REMAINING_COUNT: {len(remaining_chapters)}
|
- REMAINING_COUNT: {len(remaining_chapters)}
|
||||||
|
|
||||||
@@ -254,7 +254,7 @@ def check_and_propagate(bp, manuscript, changed_chap_num, folder, change_summary
|
|||||||
TASK: Summarize the key events and ending state of this chapter for continuity tracking.
|
TASK: Summarize the key events and ending state of this chapter for continuity tracking.
|
||||||
|
|
||||||
TEXT:
|
TEXT:
|
||||||
{changed_chap.get('content', '')[:10000]}
|
{utils.truncate_to_tokens(changed_chap.get('content', ''), 2500)}
|
||||||
|
|
||||||
FOCUS:
|
FOCUS:
|
||||||
- Major plot points.
|
- Major plot points.
|
||||||
@@ -350,7 +350,7 @@ def check_and_propagate(bp, manuscript, changed_chap_num, folder, change_summary
|
|||||||
CHANGE_SUMMARY: {current_context}
|
CHANGE_SUMMARY: {current_context}
|
||||||
|
|
||||||
CHAPTER_TO_CHECK (Ch {target_chap['num']}):
|
CHAPTER_TO_CHECK (Ch {target_chap['num']}):
|
||||||
{target_chap['content'][:12000]}
|
{utils.truncate_to_tokens(target_chap['content'], 3000)}
|
||||||
|
|
||||||
DECISION_LOGIC:
|
DECISION_LOGIC:
|
||||||
- If the chapter directly contradicts the change (references dead characters, items that no longer exist, events that didn't happen), status = REWRITE.
|
- If the chapter directly contradicts the change (references dead characters, items that no longer exist, events that didn't happen), status = REWRITE.
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
|
|||||||
|
|
||||||
prev_context_block = ""
|
prev_context_block = ""
|
||||||
if prev_content:
|
if prev_content:
|
||||||
trunc_content = prev_content[-3000:] if len(prev_content) > 3000 else prev_content
|
trunc_content = utils.truncate_to_tokens(prev_content, 2000)
|
||||||
prev_context_block = f"\nPREVIOUS CHAPTER TEXT (For Tone & Continuity):\n{trunc_content}\n"
|
prev_context_block = f"\nPREVIOUS CHAPTER TEXT (For Tone & Continuity):\n{trunc_content}\n"
|
||||||
|
|
||||||
chars_for_writer = [
|
chars_for_writer = [
|
||||||
@@ -238,7 +238,7 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
|
|||||||
HARD_CONSTRAINTS:
|
HARD_CONSTRAINTS:
|
||||||
- TARGET_WORDS: ~{est_words} words (aim for this; ±20% is acceptable if the scene genuinely demands it — but do not condense beats to save space)
|
- TARGET_WORDS: ~{est_words} words (aim for this; ±20% is acceptable if the scene genuinely demands it — but do not condense beats to save space)
|
||||||
- BEATS MUST BE COVERED: {json.dumps(chap.get('beats', []))}
|
- BEATS MUST BE COVERED: {json.dumps(chap.get('beats', []))}
|
||||||
- SUMMARY CONTEXT: {prev_sum[:1500]}
|
- SUMMARY CONTEXT: {utils.truncate_to_tokens(prev_sum, 600)}
|
||||||
|
|
||||||
AUTHOR_VOICE:
|
AUTHOR_VOICE:
|
||||||
{persona_info}
|
{persona_info}
|
||||||
|
|||||||
@@ -112,6 +112,19 @@ def analyze_persona():
|
|||||||
data = request.json
|
data = request.json
|
||||||
sample = data.get('sample_text', '')
|
sample = data.get('sample_text', '')
|
||||||
|
|
||||||
|
# Cache by a hash of the inputs to avoid redundant API calls for unchanged data
|
||||||
|
cache_key = utils.make_cache_key(
|
||||||
|
"persona_analyze",
|
||||||
|
data.get('name', ''),
|
||||||
|
data.get('age', ''),
|
||||||
|
data.get('gender', ''),
|
||||||
|
data.get('nationality', ''),
|
||||||
|
sample[:500]
|
||||||
|
)
|
||||||
|
cached = utils.get_ai_cache(cache_key)
|
||||||
|
if cached:
|
||||||
|
return cached
|
||||||
|
|
||||||
prompt = f"""
|
prompt = f"""
|
||||||
ROLE: Literary Analyst
|
ROLE: Literary Analyst
|
||||||
TASK: Create or analyze an Author Persona profile.
|
TASK: Create or analyze an Author Persona profile.
|
||||||
@@ -119,7 +132,7 @@ def analyze_persona():
|
|||||||
INPUT_DATA:
|
INPUT_DATA:
|
||||||
- NAME: {data.get('name')}
|
- NAME: {data.get('name')}
|
||||||
- DEMOGRAPHICS: Age: {data.get('age')} | Gender: {data.get('gender')} | Nationality: {data.get('nationality')}
|
- DEMOGRAPHICS: Age: {data.get('age')} | Gender: {data.get('gender')} | Nationality: {data.get('nationality')}
|
||||||
- SAMPLE_TEXT: {sample[:3000]}
|
- SAMPLE_TEXT: {utils.truncate_to_tokens(sample, 750)}
|
||||||
|
|
||||||
INSTRUCTIONS:
|
INSTRUCTIONS:
|
||||||
1. BIO: Write a 2-3 sentence description of the writing style. If sample is provided, analyze it. If not, invent a style that fits the demographics/name.
|
1. BIO: Write a 2-3 sentence description of the writing style. If sample is provided, analyze it. If not, invent a style that fits the demographics/name.
|
||||||
@@ -130,6 +143,8 @@ def analyze_persona():
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
response = ai_models.model_logic.generate_content(prompt)
|
response = ai_models.model_logic.generate_content(prompt)
|
||||||
return json.loads(utils.clean_json(response.text))
|
result = json.loads(utils.clean_json(response.text))
|
||||||
|
utils.set_ai_cache(cache_key, result)
|
||||||
|
return result
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"error": str(e)}, 500
|
return {"error": str(e)}, 500
|
||||||
|
|||||||
Reference in New Issue
Block a user