From ff5093a5f9cee53c0c066b5d5b6acc22d66488e3 Mon Sep 17 00:00:00 2001
From: Mike Wichers <thethreemagi@outlook.com>
Date: Sun, 22 Feb 2026 22:31:22 -0500
Subject: [PATCH] =?UTF-8?q?fix:=20Pipeline=20hardening=20=E2=80=94=20error?=
 =?UTF-8?q?=20handling,=20token=20efficiency,=20and=20robustness?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

core/utils.py:
- estimate_tokens: improved heuristic 4 chars/token → 3.5 chars/token (more accurate)
- truncate_to_tokens: added keep_head=True mode for head+tail truncation (better
  context retention for story summaries that need both opening and recent content)
- load_json: explicit exception handling (json.JSONDecodeError, OSError) with log
  instead of silent returns; added utf-8 encoding with error replacement
- log_image_attempt: replaced bare except with (json.JSONDecodeError, OSError);
  added utf-8 encoding to output write
- log_usage: replaced bare except with AttributeError for token count extraction

story/bible_tracker.py:
- merge_selected_changes: wrapped all int() key casts (char idx, book num, beat idx)
  in try/except with meaningful log warning instead of crashing on malformed keys
- harvest_metadata: replaced bare except:pass with except Exception as e + log message

cli/engine.py:
- Persona validation: added warning when all 3 attempts fail and substandard persona
  is accepted — flags elevated voice-drift risk for the run
- Lore index updates: throttled from every chapter to every 3 chapters; lore is
  stable after the first few chapters (~10% token saving per book)
- Mid-gen consistency check: now samples first 2 + last 8 chapters instead of passing
  full manuscript — caps token cost regardless of book length

story/writer.py:
- Two-pass polish: added local filter-word density check (no API call); skips the
  Pro polish if density < 1 per 83 words — saves ~8K tokens on already-clean drafts
- Polish prompt: added prev_context_block for continuity — polished chapter now
  maintains seamless flow from the previous chapter's ending

marketing/fonts.py:
- Separated requests.exceptions.Timeout with specific log message vs generic failure
- Added explicit log message when Roboto fallback also fails (returns None)

marketing/blurb.py:
- Added word count trim: blurbs > 220 words trimmed to last sentence within 220 words
- Changed bare except to except Exception as e with log message
- Added utf-8 encoding to file writes; logs final word count

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 cli/engine.py          | 13 ++++++++----
 core/utils.py          | 45 ++++++++++++++++++++++++++++++------------
 marketing/blurb.py     | 26 +++++++++++++++++++-----
 marketing/fonts.py     | 14 +++++++++----
 story/bible_tracker.py | 21 ++++++++++++++++----
 story/writer.py        | 23 +++++++++++++++------
 6 files changed, 106 insertions(+), 36 deletions(-)

diff --git a/cli/engine.py b/cli/engine.py
index d45e327..56e1a90 100644
--- a/cli/engine.py
+++ b/cli/engine.py
@@ -57,6 +57,8 @@ def process_book(bp, folder, context="", resume=False, interactive=False):
                     candidate_persona = style_persona.create_initial_persona(bp, folder)
                     is_valid, p_score = style_persona.validate_persona(bp, candidate_persona, folder)
                     if is_valid or persona_attempt == max_persona_attempts:
+                        if not is_valid:
+                            utils.log("SYSTEM", f"  ⚠️ Persona accepted after {max_persona_attempts} attempts despite low score ({p_score}/10). Voice drift risk elevated.")
                         bp['book_metadata']['author_details'] = candidate_persona
                         break
                     utils.log("SYSTEM", f"  -> Persona attempt {persona_attempt}/{max_persona_attempts} scored {p_score}/10. Regenerating...")
@@ -268,18 +270,21 @@ def process_book(bp, folder, context="", resume=False, interactive=False):
             with open(chars_track_path, "w") as f: json.dump(tracking['characters'], f, indent=2)
             with open(warn_track_path, "w") as f: json.dump(tracking.get('content_warnings', []), f, indent=2)
 
-            # Update Lore Index (Item 8: RAG-Lite)
-            tracking['lore'] = bible_tracker.update_lore_index(folder, txt, tracking.get('lore', {}))
-            with open(lore_track_path, "w") as f: json.dump(tracking['lore'], f, indent=2)
+            # Update Lore Index (Item 8: RAG-Lite) — every 3 chapters (lore is stable after ch 1-3)
+            if i == 0 or i % 3 == 0:
+                tracking['lore'] = bible_tracker.update_lore_index(folder, txt, tracking.get('lore', {}))
+                with open(lore_track_path, "w") as f: json.dump(tracking['lore'], f, indent=2)
 
             # Update Structured Story State (Item 9: Thread Tracking)
             current_story_state = story_state.update_story_state(txt, ch['chapter_number'], current_story_state, folder)
 
             # Exp 5: Mid-gen Consistency Snapshot (every 10 chapters)
+            # Sample: first 2 + last 8 chapters to keep token cost bounded regardless of book length
             if len(ms) > 0 and len(ms) % 10 == 0:
                 utils.log("EDITOR", f"--- Mid-gen consistency check after chapter {ch['chapter_number']} ({len(ms)} written) ---")
                 try:
-                    consistency = story_editor.analyze_consistency(bp, ms, folder)
+                    ms_sample = (ms[:2] + ms[-8:]) if len(ms) > 10 else ms
+                    consistency = story_editor.analyze_consistency(bp, ms_sample, folder)
                     issues = consistency.get('issues', [])
                     if issues:
                         for issue in issues:
diff --git a/core/utils.py b/core/utils.py
index 78ab0bb..0cb0208 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -23,18 +23,27 @@ PRICING_CACHE = {}
 # --- Token Estimation & Truncation Utilities ---
 
 def estimate_tokens(text):
-    """Estimate token count using a 4-chars-per-token heuristic (no external libs required)."""
+    """Estimate token count using a 3.5-chars-per-token heuristic (more accurate than /4)."""
     if not text:
         return 0
-    return max(1, len(text) // 4)
+    return max(1, int(len(text) / 3.5))
 
-def truncate_to_tokens(text, max_tokens):
-    """Truncate text to approximately max_tokens, keeping the most recent (tail) content."""
+def truncate_to_tokens(text, max_tokens, keep_head=False):
+    """Truncate text to approximately max_tokens.
+
+    keep_head=False (default): keep the most recent (tail) content — good for 'story so far'.
+    keep_head=True: keep first third + last two thirds — good for context that needs both
+                    the opening framing and the most recent events.
+    """
     if not text:
         return text
-    max_chars = max_tokens * 4
+    max_chars = int(max_tokens * 3.5)
     if len(text) <= max_chars:
         return text
+    if keep_head:
+        head_chars = max_chars // 3
+        tail_chars = max_chars - head_chars
+        return text[:head_chars] + "\n[...]\n" + text[-tail_chars:]
     return text[-max_chars:]
 
 # --- In-Memory AI Response Cache ---
@@ -126,7 +135,14 @@ def log(phase, msg):
         except: pass
 
 def load_json(path):
-    return json.load(open(path, 'r')) if os.path.exists(path) else None
+    if not os.path.exists(path):
+        return None
+    try:
+        with open(path, 'r', encoding='utf-8', errors='replace') as f:
+            return json.load(f)
+    except (json.JSONDecodeError, OSError, ValueError) as e:
+        log("SYSTEM", f"⚠️ Failed to load JSON from {path}: {e}")
+        return None
 
 def create_default_personas():
     # Persona data is now stored in the Persona DB table; ensure the directory exists for sample files.
@@ -153,11 +169,13 @@ def log_image_attempt(folder, img_type, prompt, filename, status, error=None, sc
     data = []
     if os.path.exists(log_path):
         try:
-            with open(log_path, 'r') as f: data = json.load(f)
-        except:
-            pass
+            with open(log_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+        except (json.JSONDecodeError, OSError):
+            data = []  # Corrupted log — start fresh rather than crash
     data.append(entry)
-    with open(log_path, 'w') as f: json.dump(data, f, indent=2)
+    with open(log_path, 'w', encoding='utf-8') as f:
+        json.dump(data, f, indent=2)
 
 def get_run_folder(base_name):
     if not os.path.exists(base_name): os.makedirs(base_name)
@@ -218,9 +236,10 @@ def log_usage(folder, model_label, usage_metadata=None, image_count=0):
 
     if usage_metadata:
         try:
-            input_tokens = usage_metadata.prompt_token_count
-            output_tokens = usage_metadata.candidates_token_count
-        except: pass
+            input_tokens = usage_metadata.prompt_token_count or 0
+            output_tokens = usage_metadata.candidates_token_count or 0
+        except AttributeError:
+            pass  # usage_metadata shape varies by model; tokens stay 0
 
     cost = calculate_cost(model_label, input_tokens, output_tokens, image_count)
 
diff --git a/marketing/blurb.py b/marketing/blurb.py
index 0787167..3640c0e 100644
--- a/marketing/blurb.py
+++ b/marketing/blurb.py
@@ -44,8 +44,24 @@ def generate_blurb(bp, folder):
     try:
         response = ai_models.model_writer.generate_content(prompt)
         utils.log_usage(folder, ai_models.model_writer.name, response.usage_metadata)
-        blurb = response.text
-        with open(os.path.join(folder, "blurb.txt"), "w") as f: f.write(blurb)
-        with open(os.path.join(folder, "back_cover.txt"), "w") as f: f.write(blurb)
-    except:
-        utils.log("MARKETING", "Failed to generate blurb.")
+        blurb = response.text.strip()
+
+        # Trim to 220 words if model overshot the 150-200 word target
+        words = blurb.split()
+        if len(words) > 220:
+            blurb = " ".join(words[:220])
+            # End at the last sentence boundary within those 220 words
+            for end_ch in ['.', '!', '?']:
+                last_sent = blurb.rfind(end_ch)
+                if last_sent > len(blurb) // 2:
+                    blurb = blurb[:last_sent + 1]
+                    break
+            utils.log("MARKETING", f"  -> Blurb trimmed to {len(blurb.split())} words.")
+
+        with open(os.path.join(folder, "blurb.txt"), "w", encoding='utf-8') as f:
+            f.write(blurb)
+        with open(os.path.join(folder, "back_cover.txt"), "w", encoding='utf-8') as f:
+            f.write(blurb)
+        utils.log("MARKETING", f"  -> Blurb: {len(blurb.split())} words.")
+    except Exception as e:
+        utils.log("MARKETING", f"Failed to generate blurb: {e}")
diff --git a/marketing/fonts.py b/marketing/fonts.py
index cc7e983..6fc0b71 100644
--- a/marketing/fonts.py
+++ b/marketing/fonts.py
@@ -42,14 +42,20 @@ def download_font(font_name):
         base_url = f"https://github.com/google/fonts/raw/main/{license_type}/{clean_name}"
         for pattern in patterns:
             try:
-                r = requests.get(f"{base_url}/{pattern}", headers=headers, timeout=5)
+                r = requests.get(f"{base_url}/{pattern}", headers=headers, timeout=6)
                 if r.status_code == 200 and len(r.content) > 1000:
-                    with open(font_path, 'wb') as f: f.write(r.content)
+                    with open(font_path, 'wb') as f:
+                        f.write(r.content)
                     utils.log("ASSETS", f"✅ Downloaded {font_name} to {font_path}")
                     return font_path
-            except Exception: continue
+            except requests.exceptions.Timeout:
+                utils.log("ASSETS", f"  Font download timeout for {font_name} ({pattern}). Trying next...")
+                continue
+            except Exception:
+                continue
 
     if clean_name != "roboto":
-        utils.log("ASSETS", f"⚠️ Font '{font_name}' not found. Falling back to Roboto.")
+        utils.log("ASSETS", f"⚠️ Font '{font_name}' not found on Google Fonts. Falling back to Roboto.")
         return download_font("Roboto")
+    utils.log("ASSETS", "⚠️ Roboto fallback also failed. PIL will use built-in default font.")
     return None
diff --git a/story/bible_tracker.py b/story/bible_tracker.py
index b0d4736..1ff47ce 100644
--- a/story/bible_tracker.py
+++ b/story/bible_tracker.py
@@ -19,7 +19,11 @@ def merge_selected_changes(original, draft, selected_keys):
                 original['project_metadata'][field] = draft['project_metadata'][field]
 
         elif parts[0] == 'char' and len(parts) >= 2:
-            idx = int(parts[1])
+            try:
+                idx = int(parts[1])
+            except (ValueError, IndexError):
+                utils.log("SYSTEM", f"⚠️ Skipping malformed bible merge key: '{key}'")
+                continue
             if idx < len(draft['characters']):
                 if idx < len(original['characters']):
                     original['characters'][idx] = draft['characters'][idx]
@@ -27,7 +31,11 @@ def merge_selected_changes(original, draft, selected_keys):
                     original['characters'].append(draft['characters'][idx])
 
         elif parts[0] == 'book' and len(parts) >= 2:
-            book_num = int(parts[1])
+            try:
+                book_num = int(parts[1])
+            except (ValueError, IndexError):
+                utils.log("SYSTEM", f"⚠️ Skipping malformed bible merge key: '{key}'")
+                continue
             orig_book = next((b for b in original['books'] if b['book_number'] == book_num), None)
             draft_book = next((b for b in draft['books'] if b['book_number'] == book_num), None)
 
@@ -42,7 +50,11 @@ def merge_selected_changes(original, draft, selected_keys):
                     orig_book['manual_instruction'] = draft_book['manual_instruction']
 
                 elif len(parts) == 4 and parts[2] == 'beat':
-                    beat_idx = int(parts[3])
+                    try:
+                        beat_idx = int(parts[3])
+                    except (ValueError, IndexError):
+                        utils.log("SYSTEM", f"⚠️ Skipping malformed beat merge key: '{key}'")
+                        continue
                     if beat_idx < len(draft_book['plot_beats']):
                         while len(orig_book['plot_beats']) <= beat_idx:
                             orig_book['plot_beats'].append("")
@@ -153,7 +165,8 @@ def harvest_metadata(bp, folder, full_manuscript):
             if valid_chars:
                 utils.log("HARVESTER", f"Found {len(valid_chars)} new chars.")
                 bp['characters'].extend(valid_chars)
-    except: pass
+    except Exception as e:
+        utils.log("HARVESTER", f"⚠️ Metadata harvest failed: {e}")
     return bp
 
 
diff --git a/story/writer.py b/story/writer.py
index 8f60969..24c39be 100644
--- a/story/writer.py
+++ b/story/writer.py
@@ -362,12 +362,18 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
         utils.log("WRITER", f"⚠️ Failed Ch {chap['chapter_number']}: {e}")
         return f"## Chapter {chap['chapter_number']} Failed\n\nError: {e}"
 
-    # Exp 7: Two-Pass Drafting — Polish the rough draft with the logic (Pro) model
-    # before evaluation. Produces cleaner prose with fewer rewrite cycles.
-    if current_text:
-        utils.log("WRITER", f"  -> Two-pass polish (Pro model)...")
-        guidelines = get_style_guidelines()
-        fw_list = '", "'.join(guidelines['filter_words'])
+    # Exp 7: Two-Pass Drafting — Polish rough draft with the logic (Pro) model before evaluation.
+    # Skip when local filter-word heuristic shows draft is already clean (saves ~8K tokens/chapter).
+    _guidelines_for_polish = get_style_guidelines()
+    _fw_set = set(_guidelines_for_polish['filter_words'])
+    _draft_word_list = current_text.lower().split() if current_text else []
+    _fw_hit_count = sum(1 for w in _draft_word_list if w in _fw_set)
+    _fw_density = _fw_hit_count / max(len(_draft_word_list), 1)
+    _skip_polish = _fw_density < 0.012  # < ~1 filter word per 83 words → draft already clean
+
+    if current_text and not _skip_polish:
+        utils.log("WRITER", f"  -> Two-pass polish (Pro model, FW density {_fw_density:.3f})...")
+        fw_list = '", "'.join(_guidelines_for_polish['filter_words'])
         polish_prompt = f"""
         ROLE: Senior Fiction Editor
         TASK: Polish this rough draft into publication-ready prose.
@@ -379,6 +385,9 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
         TARGET_WORDS: ~{est_words}
         BEATS (must all be covered): {json.dumps(chap.get('beats', []))}
 
+        CONTINUITY (maintain seamless flow from previous chapter):
+        {prev_context_block if prev_context_block else "First chapter — no prior context."}
+
         POLISH_CHECKLIST:
         1. FILTER_REMOVAL: Remove all filter words [{fw_list}] — rewrite each to show the sensation directly.
         2. DEEP_POV: Ensure the reader is inside the POV character's experience at all times — no external narration.
@@ -404,6 +413,8 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
                 current_text = polished
         except Exception as e:
             utils.log("WRITER", f"  -> Polish pass failed: {e}. Proceeding with raw draft.")
+    elif current_text:
+        utils.log("WRITER", f"  -> Draft clean (FW density {_fw_density:.3f}). Skipping polish pass.")
 
     # Reduced from 3 → 2 attempts since polish pass already refines prose before evaluation
     max_attempts = 2