Compare commits

...

2 Commits

Author SHA1 Message Date
28a1308fbc Fix port mismatch: align Flask server to port 5000
web/app.py was hardcoded to port 7070, causing Docker port forwarding
(5000:5000) and the Dockerfile HEALTHCHECK to fail. Changed to port 5000
to match docker-compose.yml and Dockerfile configuration.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-20 23:40:24 -05:00
db70ad81f7 Blueprint v1.0.4: Implemented AI Context Optimization & Token Management
- core/utils.py: Added estimate_tokens(), truncate_to_tokens(), get_ai_cache(), set_ai_cache(), make_cache_key() utilities
- story/writer.py: Applied truncate_to_tokens() to prev_content (2000 tokens) and prev_sum (600 tokens) context injections
- story/editor.py: Applied truncate_to_tokens() to summary (1000t), last_chapter_text (800t), eval text (7500t), propagation contexts (2500t/3000t)
- web/routes/persona.py: Added MD5-keyed in-memory cache for persona analyze endpoint; truncated sample_text to 750 tokens
- ai/models.py: Added pre-dispatch payload size estimation with 30k-token warning threshold

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-20 23:30:39 -05:00
7 changed files with 80 additions and 10 deletions

View File

@@ -125,6 +125,12 @@ Open `http://localhost:5000`.
- **Resilient Model Wrapper:** Wraps every Gemini API call with up to 3 retries and exponential backoff, handles quota errors and rate limits, and can switch to an alternative model mid-stream. - **Resilient Model Wrapper:** Wraps every Gemini API call with up to 3 retries and exponential backoff, handles quota errors and rate limits, and can switch to an alternative model mid-stream.
- **Auto Model Selection:** On startup, a bootstrapper model queries the Gemini API and selects the optimal models for Logic, Writer, Artist, and Image roles. Selection is cached for 24 hours. - **Auto Model Selection:** On startup, a bootstrapper model queries the Gemini API and selects the optimal models for Logic, Writer, Artist, and Image roles. Selection is cached for 24 hours.
- **Vertex AI Support:** If `GCP_PROJECT` is set and OAuth credentials are present, initializes Vertex AI automatically for Imagen image generation. - **Vertex AI Support:** If `GCP_PROJECT` is set and OAuth credentials are present, initializes Vertex AI automatically for Imagen image generation.
- **Payload Guardrails:** Every generation call estimates the prompt token count before dispatch. If the payload exceeds 30,000 tokens, a warning is logged so runaway context injection is surfaced immediately.
### AI Context Optimization (`core/utils.py`)
- **Token Estimation:** `estimate_tokens(text)` provides a fast character-based token count approximation (`len(text) // 4`, with a minimum of 1 for non-empty text) without requiring external tokenizer libraries.
- **Context Truncation:** `truncate_to_tokens(text, max_tokens)` enforces hard caps on large context variables — previous chapter text, story summaries, and character data — before they are injected into prompts, preventing token overflows on large manuscripts.
- **AI Response Cache:** An in-memory cache (`_AI_CACHE`) keyed by MD5 hash of inputs prevents redundant API calls for deterministic tasks such as persona analysis. Results are reused for identical inputs within the same session.
### Cost Tracking ### Cost Tracking
Every AI call logs input/output token counts and estimated USD cost (using cached pricing per model). Cumulative project cost is stored in the database and displayed per user and per run. Every AI call logs input/output token counts and estimated USD cost (using cached pricing per model). Cumulative project cost is stored in the database and displayed per user and per run.

View File

@@ -45,7 +45,21 @@ class ResilientModel:
self.name = name self.name = name
self.model = genai.GenerativeModel(name, safety_settings=self.safety_settings) self.model = genai.GenerativeModel(name, safety_settings=self.safety_settings)
_TOKEN_WARN_LIMIT = 30_000
def generate_content(self, *args, **kwargs): def generate_content(self, *args, **kwargs):
# Estimate payload size and warn if it exceeds the safe limit
if args:
payload = args[0]
if isinstance(payload, str):
est = utils.estimate_tokens(payload)
elif isinstance(payload, list):
est = sum(utils.estimate_tokens(p) if isinstance(p, str) else 0 for p in payload)
else:
est = 0
if est > self._TOKEN_WARN_LIMIT:
utils.log("SYSTEM", f"⚠️ Payload warning: ~{est:,} tokens for {self.role} ({self.name}). Consider reducing context.")
retries = 0 retries = 0
max_retries = 3 max_retries = 3
base_delay = 5 base_delay = 5

View File

@@ -2,6 +2,7 @@ import os
import json import json
import datetime import datetime
import time import time
import hashlib
from core import config from core import config
import threading import threading
import re import re
@@ -19,6 +20,40 @@ _log_context = threading.local()
# Cache for dynamic pricing from AI model selection # Cache for dynamic pricing from AI model selection
PRICING_CACHE = {} PRICING_CACHE = {}
# --- Token Estimation & Truncation Utilities ---
def estimate_tokens(text):
    """Approximate how many tokens *text* would consume.

    Uses the common 4-characters-per-token rule of thumb so no external
    tokenizer library is required. None/empty input counts as zero tokens;
    any non-empty string counts as at least one.
    """
    if not text:
        return 0
    approx = len(text) // 4
    return approx if approx > 1 else 1
def truncate_to_tokens(text, max_tokens):
    """Truncate *text* so it fits within roughly *max_tokens* tokens.

    Keeps the tail (most recent content), since for chapter/summary context
    injection the ending matters most for continuity. Uses the same
    4-chars-per-token heuristic as estimate_tokens().

    Args:
        text: String to truncate; None/empty is returned unchanged.
        max_tokens: Approximate token budget; values <= 0 yield "".

    Returns:
        The original text if it already fits, otherwise its trailing slice.
    """
    if not text:
        return text
    if max_tokens <= 0:
        # Guard: with max_tokens == 0, text[-0:] is text[0:] and would
        # return the WHOLE string, silently blowing the budget.
        return ""
    max_chars = max_tokens * 4
    if len(text) <= max_chars:
        return text
    return text[-max_chars:]
# --- In-Memory AI Response Cache ---
# NOTE: process-local and unbounded; entries live for the lifetime of the
# worker process. Keys should come from make_cache_key() so lookups are
# stable hashes of the inputs.
_AI_CACHE = {}

def get_ai_cache(key):
    """Return the cached AI response for *key*, or None if absent."""
    return _AI_CACHE.get(key)

def set_ai_cache(key, value):
    """Store *value* in the in-memory cache under *key*."""
    _AI_CACHE[key] = value

def make_cache_key(prefix, *parts):
    """Build a stable MD5 cache key from a prefix and variable string parts.

    Each part is length-prefixed before joining so adjacent parts cannot
    collide by shifting a delimiter between them: previously ("a|b", "c")
    and ("a", "b|c") hashed to the same key, which could return a cached
    response for the wrong inputs.
    """
    encoded = (str(p) for p in parts)
    raw = "|".join(f"{len(s)}:{s}" for s in encoded)
    digest = hashlib.md5(raw.encode('utf-8', errors='replace')).hexdigest()
    return f"{prefix}:{digest}"
def set_log_file(filepath): def set_log_file(filepath):
_log_context.log_file = filepath _log_context.log_file = filepath

View File

@@ -59,7 +59,7 @@ def evaluate_chapter_quality(text, chapter_title, genre, model, folder):
}} }}
""" """
try: try:
response = model.generate_content([prompt, text[:30000]]) response = model.generate_content([prompt, utils.truncate_to_tokens(text, 7500)])
model_name = getattr(model, 'name', ai_models.logic_model_name) model_name = getattr(model, 'name', ai_models.logic_model_name)
utils.log_usage(folder, model_name, response.usage_metadata) utils.log_usage(folder, model_name, response.usage_metadata)
data = json.loads(utils.clean_json(response.text)) data = json.loads(utils.clean_json(response.text))
@@ -86,8 +86,8 @@ def check_pacing(bp, summary, last_chapter_text, last_chapter_data, remaining_ch
TASK: Analyze pacing. TASK: Analyze pacing.
CONTEXT: CONTEXT:
- PREVIOUS_SUMMARY: {summary[-3000:]} - PREVIOUS_SUMMARY: {utils.truncate_to_tokens(summary, 1000)}
- CURRENT_CHAPTER: {last_chapter_text[-2000:]} - CURRENT_CHAPTER: {utils.truncate_to_tokens(last_chapter_text, 800)}
- UPCOMING: {json.dumps([c['title'] for c in remaining_chapters[:3]])} - UPCOMING: {json.dumps([c['title'] for c in remaining_chapters[:3]])}
- REMAINING_COUNT: {len(remaining_chapters)} - REMAINING_COUNT: {len(remaining_chapters)}
@@ -254,7 +254,7 @@ def check_and_propagate(bp, manuscript, changed_chap_num, folder, change_summary
TASK: Summarize the key events and ending state of this chapter for continuity tracking. TASK: Summarize the key events and ending state of this chapter for continuity tracking.
TEXT: TEXT:
{changed_chap.get('content', '')[:10000]} {utils.truncate_to_tokens(changed_chap.get('content', ''), 2500)}
FOCUS: FOCUS:
- Major plot points. - Major plot points.
@@ -350,7 +350,7 @@ def check_and_propagate(bp, manuscript, changed_chap_num, folder, change_summary
CHANGE_SUMMARY: {current_context} CHANGE_SUMMARY: {current_context}
CHAPTER_TO_CHECK (Ch {target_chap['num']}): CHAPTER_TO_CHECK (Ch {target_chap['num']}):
{target_chap['content'][:12000]} {utils.truncate_to_tokens(target_chap['content'], 3000)}
DECISION_LOGIC: DECISION_LOGIC:
- If the chapter directly contradicts the change (references dead characters, items that no longer exist, events that didn't happen), status = REWRITE. - If the chapter directly contradicts the change (references dead characters, items that no longer exist, events that didn't happen), status = REWRITE.

View File

@@ -71,7 +71,7 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
prev_context_block = "" prev_context_block = ""
if prev_content: if prev_content:
trunc_content = prev_content[-3000:] if len(prev_content) > 3000 else prev_content trunc_content = utils.truncate_to_tokens(prev_content, 2000)
prev_context_block = f"\nPREVIOUS CHAPTER TEXT (For Tone & Continuity):\n{trunc_content}\n" prev_context_block = f"\nPREVIOUS CHAPTER TEXT (For Tone & Continuity):\n{trunc_content}\n"
chars_for_writer = [ chars_for_writer = [
@@ -238,7 +238,7 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
HARD_CONSTRAINTS: HARD_CONSTRAINTS:
- TARGET_WORDS: ~{est_words} words (aim for this; ±20% is acceptable if the scene genuinely demands it — but do not condense beats to save space) - TARGET_WORDS: ~{est_words} words (aim for this; ±20% is acceptable if the scene genuinely demands it — but do not condense beats to save space)
- BEATS MUST BE COVERED: {json.dumps(chap.get('beats', []))} - BEATS MUST BE COVERED: {json.dumps(chap.get('beats', []))}
- SUMMARY CONTEXT: {prev_sum[:1500]} - SUMMARY CONTEXT: {utils.truncate_to_tokens(prev_sum, 600)}
AUTHOR_VOICE: AUTHOR_VOICE:
{persona_info} {persona_info}

View File

@@ -103,4 +103,4 @@ if __name__ == "__main__":
t = threading.Thread(target=run_huey, daemon=True) t = threading.Thread(target=run_huey, daemon=True)
t.start() t.start()
app.run(host='0.0.0.0', port=7070, debug=False) app.run(host='0.0.0.0', port=5000, debug=False)

View File

@@ -112,6 +112,19 @@ def analyze_persona():
data = request.json data = request.json
sample = data.get('sample_text', '') sample = data.get('sample_text', '')
# Cache by a hash of the inputs to avoid redundant API calls for unchanged data
cache_key = utils.make_cache_key(
"persona_analyze",
data.get('name', ''),
data.get('age', ''),
data.get('gender', ''),
data.get('nationality', ''),
sample[:500]
)
cached = utils.get_ai_cache(cache_key)
if cached:
return cached
prompt = f""" prompt = f"""
ROLE: Literary Analyst ROLE: Literary Analyst
TASK: Create or analyze an Author Persona profile. TASK: Create or analyze an Author Persona profile.
@@ -119,7 +132,7 @@ def analyze_persona():
INPUT_DATA: INPUT_DATA:
- NAME: {data.get('name')} - NAME: {data.get('name')}
- DEMOGRAPHICS: Age: {data.get('age')} | Gender: {data.get('gender')} | Nationality: {data.get('nationality')} - DEMOGRAPHICS: Age: {data.get('age')} | Gender: {data.get('gender')} | Nationality: {data.get('nationality')}
- SAMPLE_TEXT: {sample[:3000]} - SAMPLE_TEXT: {utils.truncate_to_tokens(sample, 750)}
INSTRUCTIONS: INSTRUCTIONS:
1. BIO: Write a 2-3 sentence description of the writing style. If sample is provided, analyze it. If not, invent a style that fits the demographics/name. 1. BIO: Write a 2-3 sentence description of the writing style. If sample is provided, analyze it. If not, invent a style that fits the demographics/name.
@@ -130,6 +143,8 @@ def analyze_persona():
""" """
try: try:
response = ai_models.model_logic.generate_content(prompt) response = ai_models.model_logic.generate_content(prompt)
return json.loads(utils.clean_json(response.text)) result = json.loads(utils.clean_json(response.text))
utils.set_ai_cache(cache_key, result)
return result
except Exception as e: except Exception as e:
return {"error": str(e)}, 500 return {"error": str(e)}, 500