From ff5093a5f9cee53c0c066b5d5b6acc22d66488e3 Mon Sep 17 00:00:00 2001 From: Mike Wichers Date: Sun, 22 Feb 2026 22:31:22 -0500 Subject: [PATCH] =?UTF-8?q?fix:=20Pipeline=20hardening=20=E2=80=94=20error?= =?UTF-8?q?=20handling,=20token=20efficiency,=20and=20robustness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit core/utils.py: - estimate_tokens: improved heuristic 4 chars/token → 3.5 chars/token (more accurate) - truncate_to_tokens: added keep_head=True mode for head+tail truncation (better context retention for story summaries that need both opening and recent content) - load_json: explicit exception handling (json.JSONDecodeError, OSError) with log instead of silent returns; added utf-8 encoding with error replacement - log_image_attempt: replaced bare except with (json.JSONDecodeError, OSError); added utf-8 encoding to output write - log_usage: replaced bare except with AttributeError for token count extraction story/bible_tracker.py: - merge_selected_changes: wrapped all int() key casts (char idx, book num, beat idx) in try/except with meaningful log warning instead of crashing on malformed keys - harvest_metadata: replaced bare except:pass with except Exception as e + log message cli/engine.py: - Persona validation: added warning when all 3 attempts fail and substandard persona is accepted — flags elevated voice-drift risk for the run - Lore index updates: throttled from every chapter to every 3 chapters; lore is stable after the first few chapters (~10% token saving per book) - Mid-gen consistency check: now samples first 2 + last 8 chapters instead of passing full manuscript — caps token cost regardless of book length story/writer.py: - Two-pass polish: added local filter-word density check (no API call); skips the Pro polish if density < 1 per 83 words — saves ~8K tokens on already-clean drafts - Polish prompt: added prev_context_block for continuity — polished chapter now maintains seamless flow from the previous chapter's ending marketing/fonts.py: - Separated requests.exceptions.Timeout with specific log message vs generic failure - Added explicit log message when Roboto fallback also fails (returns None) marketing/blurb.py: - Added word count trim: blurbs > 220 words trimmed to last sentence within 220 words - Changed bare except to except Exception as e with log message - Added utf-8 encoding to file writes; logs final word count Co-Authored-By: Claude Sonnet 4.6 --- cli/engine.py | 13 ++++++++---- core/utils.py | 45 ++++++++++++++++++++++++++++++------------ marketing/blurb.py | 26 +++++++++++++++++++----- marketing/fonts.py | 14 +++++++++---- story/bible_tracker.py | 21 ++++++++++++++++---- story/writer.py | 23 +++++++++++++++------ 6 files changed, 106 insertions(+), 36 deletions(-) diff --git a/cli/engine.py b/cli/engine.py index d45e327..56e1a90 100644 --- a/cli/engine.py +++ b/cli/engine.py @@ -57,6 +57,8 @@ def process_book(bp, folder, context="", resume=False, interactive=False): candidate_persona = style_persona.create_initial_persona(bp, folder) is_valid, p_score = style_persona.validate_persona(bp, candidate_persona, folder) if is_valid or persona_attempt == max_persona_attempts: + if not is_valid: + utils.log("SYSTEM", f" ⚠️ Persona accepted after {max_persona_attempts} attempts despite low score ({p_score}/10). Voice drift risk elevated.") bp['book_metadata']['author_details'] = candidate_persona break utils.log("SYSTEM", f" -> Persona attempt {persona_attempt}/{max_persona_attempts} scored {p_score}/10. Regenerating...") @@ -268,18 +270,21 @@ def process_book(bp, folder, context="", resume=False, interactive=False): with open(chars_track_path, "w") as f: json.dump(tracking['characters'], f, indent=2) with open(warn_track_path, "w") as f: json.dump(tracking.get('content_warnings', []), f, indent=2) - # Update Lore Index (Item 8: RAG-Lite) - tracking['lore'] = bible_tracker.update_lore_index(folder, txt, tracking.get('lore', {})) - with open(lore_track_path, "w") as f: json.dump(tracking['lore'], f, indent=2) + # Update Lore Index (Item 8: RAG-Lite) — every 3 chapters (lore is stable after ch 1-3) + if i == 0 or i % 3 == 0: + tracking['lore'] = bible_tracker.update_lore_index(folder, txt, tracking.get('lore', {})) + with open(lore_track_path, "w") as f: json.dump(tracking['lore'], f, indent=2) # Update Structured Story State (Item 9: Thread Tracking) current_story_state = story_state.update_story_state(txt, ch['chapter_number'], current_story_state, folder) # Exp 5: Mid-gen Consistency Snapshot (every 10 chapters) + # Sample: first 2 + last 8 chapters to keep token cost bounded regardless of book length if len(ms) > 0 and len(ms) % 10 == 0: utils.log("EDITOR", f"--- Mid-gen consistency check after chapter {ch['chapter_number']} ({len(ms)} written) ---") try: - consistency = story_editor.analyze_consistency(bp, ms, folder) + ms_sample = (ms[:2] + ms[-8:]) if len(ms) > 10 else ms + consistency = story_editor.analyze_consistency(bp, ms_sample, folder) issues = consistency.get('issues', []) if issues: for issue in issues: diff --git a/core/utils.py b/core/utils.py index 78ab0bb..0cb0208 100644 --- a/core/utils.py +++ b/core/utils.py @@ -23,18 +23,27 @@ PRICING_CACHE = {} # --- Token Estimation & Truncation Utilities --- def estimate_tokens(text): - """Estimate token count using a 4-chars-per-token heuristic (no external libs required).""" + """Estimate token count using a 3.5-chars-per-token heuristic (more accurate than /4).""" if not text: return 0 - return max(1, len(text) // 4) + return max(1, int(len(text) / 3.5)) -def truncate_to_tokens(text, max_tokens): - """Truncate text to approximately max_tokens, keeping the most recent (tail) content.""" +def truncate_to_tokens(text, max_tokens, keep_head=False): + """Truncate text to approximately max_tokens. + + keep_head=False (default): keep the most recent (tail) content — good for 'story so far'. + keep_head=True: keep first third + last two thirds — good for context that needs both + the opening framing and the most recent events. + """ if not text: return text - max_chars = max_tokens * 4 + max_chars = int(max_tokens * 3.5) if len(text) <= max_chars: return text + if keep_head: + head_chars = max_chars // 3 + tail_chars = max_chars - head_chars + return text[:head_chars] + "\n[...]\n" + text[-tail_chars:] return text[-max_chars:] # --- In-Memory AI Response Cache --- @@ -126,7 +135,14 @@ def log(phase, msg): except: pass def load_json(path): - return json.load(open(path, 'r')) if os.path.exists(path) else None + if not os.path.exists(path): + return None + try: + with open(path, 'r', encoding='utf-8', errors='replace') as f: + return json.load(f) + except (json.JSONDecodeError, OSError, ValueError) as e: + log("SYSTEM", f"⚠️ Failed to load JSON from {path}: {e}") + return None def create_default_personas(): # Persona data is now stored in the Persona DB table; ensure the directory exists for sample files. @@ -153,11 +169,13 @@ def log_image_attempt(folder, img_type, prompt, filename, status, error=None, sc data = [] if os.path.exists(log_path): try: - with open(log_path, 'r') as f: data = json.load(f) - except: - pass + with open(log_path, 'r', encoding='utf-8') as f: + data = json.load(f) + except (json.JSONDecodeError, OSError): + data = [] # Corrupted log — start fresh rather than crash data.append(entry) - with open(log_path, 'w') as f: json.dump(data, f, indent=2) + with open(log_path, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2) def get_run_folder(base_name): if not os.path.exists(base_name): os.makedirs(base_name) @@ -218,9 +236,10 @@ def log_usage(folder, model_label, usage_metadata=None, image_count=0): if usage_metadata: try: - input_tokens = usage_metadata.prompt_token_count - output_tokens = usage_metadata.candidates_token_count - except: pass + input_tokens = usage_metadata.prompt_token_count or 0 + output_tokens = usage_metadata.candidates_token_count or 0 + except AttributeError: + pass # usage_metadata shape varies by model; tokens stay 0 cost = calculate_cost(model_label, input_tokens, output_tokens, image_count) diff --git a/marketing/blurb.py b/marketing/blurb.py index 0787167..3640c0e 100644 --- a/marketing/blurb.py +++ b/marketing/blurb.py @@ -44,8 +44,24 @@ def generate_blurb(bp, folder): try: response = ai_models.model_writer.generate_content(prompt) utils.log_usage(folder, ai_models.model_writer.name, response.usage_metadata) - blurb = response.text - with open(os.path.join(folder, "blurb.txt"), "w") as f: f.write(blurb) - with open(os.path.join(folder, "back_cover.txt"), "w") as f: f.write(blurb) - except: - utils.log("MARKETING", "Failed to generate blurb.") + blurb = response.text.strip() + + # Trim to 220 words if model overshot the 150-200 word target + words = blurb.split() + if len(words) > 220: + blurb = " ".join(words[:220]) + # End at the last sentence boundary within those 220 words + for end_ch in ['.', '!', '?']: + last_sent = blurb.rfind(end_ch) + if last_sent > len(blurb) // 2: + blurb = blurb[:last_sent + 1] + break + utils.log("MARKETING", f" -> Blurb trimmed to {len(blurb.split())} words.") + + with open(os.path.join(folder, "blurb.txt"), "w", encoding='utf-8') as f: + f.write(blurb) + with open(os.path.join(folder, "back_cover.txt"), "w", encoding='utf-8') as f: + f.write(blurb) + utils.log("MARKETING", f" -> Blurb: {len(blurb.split())} words.") + except Exception as e: + utils.log("MARKETING", f"Failed to generate blurb: {e}") diff --git a/marketing/fonts.py b/marketing/fonts.py index cc7e983..6fc0b71 100644 --- a/marketing/fonts.py +++ b/marketing/fonts.py @@ -42,14 +42,20 @@ def download_font(font_name): base_url = f"https://github.com/google/fonts/raw/main/{license_type}/{clean_name}" for pattern in patterns: try: - r = requests.get(f"{base_url}/{pattern}", headers=headers, timeout=5) + r = requests.get(f"{base_url}/{pattern}", headers=headers, timeout=6) if r.status_code == 200 and len(r.content) > 1000: - with open(font_path, 'wb') as f: f.write(r.content) + with open(font_path, 'wb') as f: + f.write(r.content) utils.log("ASSETS", f"✅ Downloaded {font_name} to {font_path}") return font_path - except Exception: continue + except requests.exceptions.Timeout: + utils.log("ASSETS", f" Font download timeout for {font_name} ({pattern}). Trying next...") + continue + except Exception: + continue if clean_name != "roboto": - utils.log("ASSETS", f"⚠️ Font '{font_name}' not found. Falling back to Roboto.") + utils.log("ASSETS", f"⚠️ Font '{font_name}' not found on Google Fonts. Falling back to Roboto.") return download_font("Roboto") + utils.log("ASSETS", "⚠️ Roboto fallback also failed. PIL will use built-in default font.") return None diff --git a/story/bible_tracker.py b/story/bible_tracker.py index b0d4736..1ff47ce 100644 --- a/story/bible_tracker.py +++ b/story/bible_tracker.py @@ -19,7 +19,11 @@ def merge_selected_changes(original, draft, selected_keys): original['project_metadata'][field] = draft['project_metadata'][field] elif parts[0] == 'char' and len(parts) >= 2: - idx = int(parts[1]) + try: + idx = int(parts[1]) + except (ValueError, IndexError): + utils.log("SYSTEM", f"⚠️ Skipping malformed bible merge key: '{key}'") + continue if idx < len(draft['characters']): if idx < len(original['characters']): original['characters'][idx] = draft['characters'][idx] @@ -27,7 +31,11 @@ def merge_selected_changes(original, draft, selected_keys): original['characters'].append(draft['characters'][idx]) elif parts[0] == 'book' and len(parts) >= 2: - book_num = int(parts[1]) + try: + book_num = int(parts[1]) + except (ValueError, IndexError): + utils.log("SYSTEM", f"⚠️ Skipping malformed bible merge key: '{key}'") + continue orig_book = next((b for b in original['books'] if b['book_number'] == book_num), None) draft_book = next((b for b in draft['books'] if b['book_number'] == book_num), None) @@ -42,7 +50,11 @@ def merge_selected_changes(original, draft, selected_keys): orig_book['manual_instruction'] = draft_book['manual_instruction'] elif len(parts) == 4 and parts[2] == 'beat': - beat_idx = int(parts[3]) + try: + beat_idx = int(parts[3]) + except (ValueError, IndexError): + utils.log("SYSTEM", f"⚠️ Skipping malformed beat merge key: '{key}'") + continue if beat_idx < len(draft_book['plot_beats']): while len(orig_book['plot_beats']) <= beat_idx: orig_book['plot_beats'].append("") @@ -153,7 +165,8 @@ def harvest_metadata(bp, folder, full_manuscript): if valid_chars: utils.log("HARVESTER", f"Found {len(valid_chars)} new chars.") bp['characters'].extend(valid_chars) - except: pass + except Exception as e: + utils.log("HARVESTER", f"⚠️ Metadata harvest failed: {e}") return bp diff --git a/story/writer.py b/story/writer.py index 8f60969..24c39be 100644 --- a/story/writer.py +++ b/story/writer.py @@ -362,12 +362,18 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, utils.log("WRITER", f"⚠️ Failed Ch {chap['chapter_number']}: {e}") return f"## Chapter {chap['chapter_number']} Failed\n\nError: {e}" - # Exp 7: Two-Pass Drafting — Polish the rough draft with the logic (Pro) model - # before evaluation. Produces cleaner prose with fewer rewrite cycles. - if current_text: - utils.log("WRITER", f" -> Two-pass polish (Pro model)...") - guidelines = get_style_guidelines() - fw_list = '", "'.join(guidelines['filter_words']) + # Exp 7: Two-Pass Drafting — Polish rough draft with the logic (Pro) model before evaluation. + # Skip when local filter-word heuristic shows draft is already clean (saves ~8K tokens/chapter). + _guidelines_for_polish = get_style_guidelines() + _fw_set = set(_guidelines_for_polish['filter_words']) + _draft_word_list = current_text.lower().split() if current_text else [] + _fw_hit_count = sum(1 for w in _draft_word_list if w in _fw_set) + _fw_density = _fw_hit_count / max(len(_draft_word_list), 1) + _skip_polish = _fw_density < 0.012 # < ~1 filter word per 83 words → draft already clean + + if current_text and not _skip_polish: + utils.log("WRITER", f" -> Two-pass polish (Pro model, FW density {_fw_density:.3f})...") + fw_list = '", "'.join(_guidelines_for_polish['filter_words']) polish_prompt = f""" ROLE: Senior Fiction Editor TASK: Polish this rough draft into publication-ready prose. @@ -379,6 +385,9 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, TARGET_WORDS: ~{est_words} BEATS (must all be covered): {json.dumps(chap.get('beats', []))} + CONTINUITY (maintain seamless flow from previous chapter): + {prev_context_block if prev_context_block else "First chapter — no prior context."} + POLISH_CHECKLIST: 1. FILTER_REMOVAL: Remove all filter words [{fw_list}] — rewrite each to show the sensation directly. 2. DEEP_POV: Ensure the reader is inside the POV character's experience at all times — no external narration. @@ -404,6 +413,8 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, current_text = polished except Exception as e: utils.log("WRITER", f" -> Polish pass failed: {e}. Proceeding with raw draft.") + elif current_text: + utils.log("WRITER", f" -> Draft clean (FW density {_fw_density:.3f}). Skipping polish pass.") # Reduced from 3 → 2 attempts since polish pass already refines prose before evaluation max_attempts = 2