diff --git a/README.md b/README.md index 89ace00..82eca44 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,12 @@ Open `http://localhost:5000`. - **Resilient Model Wrapper:** Wraps every Gemini API call with up to 3 retries and exponential backoff, handles quota errors and rate limits, and can switch to an alternative model mid-stream. - **Auto Model Selection:** On startup, a bootstrapper model queries the Gemini API and selects the optimal models for Logic, Writer, Artist, and Image roles. Selection is cached for 24 hours. - **Vertex AI Support:** If `GCP_PROJECT` is set and OAuth credentials are present, initializes Vertex AI automatically for Imagen image generation. +- **Payload Guardrails:** Every generation call estimates the prompt token count before dispatch. If the payload exceeds 30,000 tokens, a warning is logged so runaway context injection is surfaced immediately. + +### AI Context Optimization (`core/utils.py`) +- **Token Estimation:** `estimate_tokens(text)` provides a fast character-based token count approximation (`len(text) / 4`) without requiring external tokenizer libraries. +- **Context Truncation:** `truncate_to_tokens(text, max_tokens)` enforces hard caps on large context variables — previous chapter text, story summaries, and character data — before they are injected into prompts, preventing token overflows on large manuscripts. +- **AI Response Cache:** An in-memory cache (`_AI_CACHE`) keyed by MD5 hash of inputs prevents redundant API calls for deterministic tasks such as persona analysis. Results are reused for identical inputs within the same session. ### Cost Tracking Every AI call logs input/output token counts and estimated USD cost (using cached pricing per model). Cumulative project cost is stored in the database and displayed per user and per run. 
diff --git a/ai/models.py b/ai/models.py index 0449728..6018f57 100644 --- a/ai/models.py +++ b/ai/models.py @@ -45,7 +45,21 @@ class ResilientModel: self.name = name self.model = genai.GenerativeModel(name, safety_settings=self.safety_settings) + _TOKEN_WARN_LIMIT = 30_000 + def generate_content(self, *args, **kwargs): + # Estimate payload size and warn if it exceeds the safe limit + if args: + payload = args[0] + if isinstance(payload, str): + est = utils.estimate_tokens(payload) + elif isinstance(payload, list): + est = sum(utils.estimate_tokens(p) if isinstance(p, str) else 0 for p in payload) + else: + est = 0 + if est > self._TOKEN_WARN_LIMIT: + utils.log("SYSTEM", f"⚠️ Payload warning: ~{est:,} tokens for {self.role} ({self.name}). Consider reducing context.") + retries = 0 max_retries = 3 base_delay = 5 diff --git a/core/utils.py b/core/utils.py index 3a55134..e213a9c 100644 --- a/core/utils.py +++ b/core/utils.py @@ -2,6 +2,7 @@ import os import json import datetime import time +import hashlib from core import config import threading import re @@ -19,6 +20,40 @@ _log_context = threading.local() # Cache for dynamic pricing from AI model selection PRICING_CACHE = {} +# --- Token Estimation & Truncation Utilities --- + +def estimate_tokens(text): + """Estimate token count using a 4-chars-per-token heuristic (no external libs required).""" + if not text: + return 0 + return max(1, len(text) // 4) + +def truncate_to_tokens(text, max_tokens): + """Truncate text to approximately max_tokens, keeping the most recent (tail) content.""" + if not text: + return text + max_chars = max_tokens * 4 + if len(text) <= max_chars: + return text + return text[-max_chars:] + +# --- In-Memory AI Response Cache --- + +_AI_CACHE = {} + +def get_ai_cache(key): + """Retrieve a cached AI response by key. 
Returns None if not cached.""" + return _AI_CACHE.get(key) + +def set_ai_cache(key, value): + """Store an AI response in the in-memory cache keyed by a hash string.""" + _AI_CACHE[key] = value + +def make_cache_key(prefix, *parts): + """Build a stable MD5 cache key from a prefix and variable string parts.""" + raw = "|".join(str(p) for p in parts) + return f"{prefix}:{hashlib.md5(raw.encode('utf-8', errors='replace')).hexdigest()}" + def set_log_file(filepath): _log_context.log_file = filepath diff --git a/story/editor.py b/story/editor.py index 19bfdfa..a51e915 100644 --- a/story/editor.py +++ b/story/editor.py @@ -59,7 +59,7 @@ def evaluate_chapter_quality(text, chapter_title, genre, model, folder): }} """ try: - response = model.generate_content([prompt, text[:30000]]) + response = model.generate_content([prompt, utils.truncate_to_tokens(text, 7500)]) model_name = getattr(model, 'name', ai_models.logic_model_name) utils.log_usage(folder, model_name, response.usage_metadata) data = json.loads(utils.clean_json(response.text)) @@ -86,8 +86,8 @@ def check_pacing(bp, summary, last_chapter_text, last_chapter_data, remaining_ch TASK: Analyze pacing. CONTEXT: - - PREVIOUS_SUMMARY: {summary[-3000:]} - - CURRENT_CHAPTER: {last_chapter_text[-2000:]} + - PREVIOUS_SUMMARY: {utils.truncate_to_tokens(summary, 1000)} + - CURRENT_CHAPTER: {utils.truncate_to_tokens(last_chapter_text, 800)} - UPCOMING: {json.dumps([c['title'] for c in remaining_chapters[:3]])} - REMAINING_COUNT: {len(remaining_chapters)} @@ -254,7 +254,7 @@ def check_and_propagate(bp, manuscript, changed_chap_num, folder, change_summary TASK: Summarize the key events and ending state of this chapter for continuity tracking. TEXT: - {changed_chap.get('content', '')[:10000]} + {utils.truncate_to_tokens(changed_chap.get('content', ''), 2500)} FOCUS: - Major plot points. 
@@ -350,7 +350,7 @@ def check_and_propagate(bp, manuscript, changed_chap_num, folder, change_summary CHANGE_SUMMARY: {current_context} CHAPTER_TO_CHECK (Ch {target_chap['num']}): - {target_chap['content'][:12000]} + {utils.truncate_to_tokens(target_chap['content'], 3000)} DECISION_LOGIC: - If the chapter directly contradicts the change (references dead characters, items that no longer exist, events that didn't happen), status = REWRITE. diff --git a/story/writer.py b/story/writer.py index c143c89..4145d3a 100644 --- a/story/writer.py +++ b/story/writer.py @@ -71,7 +71,7 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, prev_context_block = "" if prev_content: - trunc_content = prev_content[-3000:] if len(prev_content) > 3000 else prev_content + trunc_content = utils.truncate_to_tokens(prev_content, 2000) prev_context_block = f"\nPREVIOUS CHAPTER TEXT (For Tone & Continuity):\n{trunc_content}\n" chars_for_writer = [ @@ -238,7 +238,7 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, HARD_CONSTRAINTS: - TARGET_WORDS: ~{est_words} words (aim for this; ±20% is acceptable if the scene genuinely demands it — but do not condense beats to save space) - BEATS MUST BE COVERED: {json.dumps(chap.get('beats', []))} - - SUMMARY CONTEXT: {prev_sum[:1500]} + - SUMMARY CONTEXT: {utils.truncate_to_tokens(prev_sum, 600)} AUTHOR_VOICE: {persona_info} diff --git a/web/routes/persona.py b/web/routes/persona.py index dc3e5dc..20c5353 100644 --- a/web/routes/persona.py +++ b/web/routes/persona.py @@ -112,6 +112,19 @@ def analyze_persona(): data = request.json sample = data.get('sample_text', '') + # Cache by a hash of the inputs to avoid redundant API calls for unchanged data + cache_key = utils.make_cache_key( + "persona_analyze", + data.get('name', ''), + data.get('age', ''), + data.get('gender', ''), + data.get('nationality', ''), + sample + ) + cached = utils.get_ai_cache(cache_key) + if cached is not None: + return cached + 
prompt = f""" ROLE: Literary Analyst TASK: Create or analyze an Author Persona profile. @@ -119,7 +132,7 @@ def analyze_persona(): INPUT_DATA: - NAME: {data.get('name')} - DEMOGRAPHICS: Age: {data.get('age')} | Gender: {data.get('gender')} | Nationality: {data.get('nationality')} - - SAMPLE_TEXT: {sample[:3000]} + - SAMPLE_TEXT: {utils.truncate_to_tokens(sample, 750)} INSTRUCTIONS: 1. BIO: Write a 2-3 sentence description of the writing style. If sample is provided, analyze it. If not, invent a style that fits the demographics/name. @@ -130,6 +143,8 @@ def analyze_persona(): """ try: response = ai_models.model_logic.generate_content(prompt) - return json.loads(utils.clean_json(response.text)) + result = json.loads(utils.clean_json(response.text)) + utils.set_ai_cache(cache_key, result) + return result except Exception as e: return {"error": str(e)}, 500