"""eval_logger.py — Per-chapter evaluation log and HTML report generator.

Writes a structured eval_log.json to the book folder during writing, then
generates a self-contained HTML report that can be downloaded and shared
with critics / prompt engineers to analyse quality patterns across a run.
"""

import html
import json
import os
import time

from core import utils


# ---------------------------------------------------------------------------
# Log writer
# ---------------------------------------------------------------------------

def append_eval_entry(folder, entry):
    """Append one chapter's evaluation record to eval_log.json.

    Called from story/writer.py at every return point in write_chapter().
    Each entry captures the chapter metadata, polish decision, per-attempt
    scores/critiques/decisions, and the final accepted score.

    Args:
        folder: Book folder path; the log file lives at <folder>/eval_log.json.
        entry: dict with one chapter's evaluation record (schema defined by
            the caller; this function treats it as opaque).
    """
    log_path = os.path.join(folder, "eval_log.json")
    data = []
    if os.path.exists(log_path):
        try:
            with open(log_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # A corrupt or foreign-shaped file must not poison the run:
            # silently restart the log as an empty list.
            if not isinstance(data, list):
                data = []
        except Exception:
            data = []
    data.append(entry)
    try:
        with open(log_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
    except Exception as e:
        # Best-effort logging: a failed eval-log write must never abort
        # the chapter-writing pipeline.
        utils.log("EVAL", f"Failed to write eval log: {e}")


# ---------------------------------------------------------------------------
# Report generation
# ---------------------------------------------------------------------------

def generate_html_report(folder, bp=None):
    """Generate a self-contained HTML evaluation report from eval_log.json.

    Args:
        folder: Book folder containing eval_log.json.
        bp: Optional book blueprint dict; only 'book_metadata' (title/genre)
            is read from it for the report header.

    Returns:
        The HTML string, or None if no log file exists / is empty / unreadable.
    """
    log_path = os.path.join(folder, "eval_log.json")
    if not os.path.exists(log_path):
        return None
    try:
        with open(log_path, 'r', encoding='utf-8') as f:
            chapters = json.load(f)
    except Exception:
        return None
    if not isinstance(chapters, list) or not chapters:
        return None

    title, genre = "Unknown Book", "Fiction"
    if bp:
        meta = bp.get('book_metadata', {})
        title = meta.get('title', title)
        genre = meta.get('genre', genre)

    # --- Summary stats ---
    # Only positive numeric scores count toward the average; 0 / missing
    # means "never scored" and would drag the mean down misleadingly.
    scores = [c.get('final_score', 0) for c in chapters
              if isinstance(c.get('final_score'), (int, float))
              and c.get('final_score', 0) > 0]
    avg_score = round(sum(scores) / len(scores), 2) if scores else 0
    total = len(chapters)
    auto_accepted = sum(1 for c in chapters
                        if c.get('final_decision') == 'auto_accepted')
    multi_attempt = sum(1 for c in chapters if len(c.get('attempts', [])) > 1)
    full_rewrites = sum(1 for c in chapters for a in c.get('attempts', [])
                        if a.get('decision') == 'full_rewrite')
    below_threshold = sum(1 for c in chapters
                          if c.get('final_decision') == 'below_threshold')
    polish_applied = sum(1 for c in chapters if c.get('polish_applied'))

    # Histogram over integer scores 1..10. Non-integer final scores are
    # deliberately excluded (they would need a binning policy).
    score_dist = {i: 0 for i in range(1, 11)}
    for c in chapters:
        s = c.get('final_score', 0)
        if isinstance(s, int) and 1 <= s <= 10:
            score_dist[s] += 1

    patterns = _mine_critique_patterns(chapters, total)
    report_date = time.strftime('%Y-%m-%d %H:%M')

    return _build_html(title, genre, report_date, chapters, avg_score, total,
                       auto_accepted, multi_attempt, full_rewrites,
                       below_threshold, polish_applied, score_dist, patterns)


# ---------------------------------------------------------------------------
# Pattern mining
# ---------------------------------------------------------------------------

def _mine_critique_patterns(chapters, total):
    """Count how many chapters' critiques mention each known issue pattern.

    Args:
        chapters: list of chapter eval records (each with an 'attempts' list
            whose items carry a 'critique' string).
        total: total chapter count (kept for signature compatibility; the
            percentage is computed by the caller).

    Returns:
        dict mapping pattern label -> {'count': int, 'chapters': [nums]},
        sorted by descending count.
    """
    pattern_keywords = {
        "Filter words (felt/saw/noticed)": [
            "filter word", "filter", "felt ", "noticed ", "realized ",
            "saw the", "heard the"],
        "Summary mode / telling": [
            "summary mode", "summariz", "telling", "show don't tell",
            "show, don't tell", "instead of dramatiz"],
        "Emotion labeling": [
            "emotion label", "told the reader", "labeling", "labelling",
            "she felt", "he felt", "was nervous", "was angry", "was sad"],
        "Deep POV issues": [
            "deep pov", "deep point of view", "distant narration",
            "remove the reader", "external narration"],
        "Pacing problems": [
            "pacing", "rushing", "too fast", "too slow", "dragging",
            "sagging", "abrupt"],
        "Dialogue too on-the-nose": [
            "on-the-nose", "on the nose", "subtext", "exposition dump",
            "characters explain"],
        "Weak chapter hook / ending": [
            "hook", "cliffhanger", "cut off abruptly", "anticlimax",
            "ending falls flat", "no tension"],
        "Passive voice / weak verbs": [
            "passive voice", "was [v", "were [v", "weak verb", "adverb"],
        "AI-isms / clichés": [
            "ai-ism", "cliché", "tapestry", "palpable", "testament",
            "azure", "cerulean", "bustling"],
        "Voice / tone inconsistency": [
            "voice", "tone inconsist", "persona", "shift in tone", "register"],
        "Missing sensory / atmosphere": [
            "sensory", "grounding", "atmosphere", "immersiv", "white room"],
    }
    counts = {}
    for pattern, keywords in pattern_keywords.items():
        matching = []
        for c in chapters:
            # Search all attempts' critiques for this chapter as one blob.
            critique_blob = " ".join(
                a.get('critique', '').lower() for a in c.get('attempts', [])
            )
            if any(kw.lower() in critique_blob for kw in keywords):
                matching.append(c.get('chapter_num', '?'))
        counts[pattern] = {'count': len(matching), 'chapters': matching}
    return dict(sorted(counts.items(), key=lambda x: x[1]['count'],
                       reverse=True))


# ---------------------------------------------------------------------------
# HTML builder
# ---------------------------------------------------------------------------

def _score_color(s):
    """Map a score to a traffic-light hex color (grey for non-numeric)."""
    try:
        s = float(s)
    except (TypeError, ValueError):
        return '#6c757d'
    if s >= 8:
        return '#28a745'
    if s >= 7:
        return '#20c997'
    if s >= 6:
        return '#ffc107'
    return '#dc3545'


def _decision_badge(d):
    """Render a decision code as a small colored HTML badge."""
    MAP = {
        'auto_accepted': ('⚡ Auto-Accept', '#28a745'),
        'accepted': ('✓ Accepted', '#17a2b8'),
        'accepted_at_max': ('✓ Accepted', '#17a2b8'),
        'below_threshold': ('⚠ Below Threshold', '#dc3545'),
        'below_threshold_accepted': ('⚠ Below Threshold', '#dc3545'),
        'full_rewrite': ('🔄 Full Rewrite', '#6f42c1'),
        'full_rewrite_failed': ('🔄✗ Rewrite Failed', '#6f42c1'),
        'refinement': ('✏ Refined', '#fd7e14'),
        'refinement_failed': ('✏✗ Refine Failed', '#fd7e14'),
        'eval_error': ('⚠ Eval Error', '#6c757d'),
    }
    # Unknown codes fall back to the raw code in a grey badge.
    label, color = MAP.get(d, (d or '?', '#6c757d'))
    return (f'<span style="background:{color};color:#fff;border-radius:10px;'
            f'padding:2px 8px;font-size:11px;white-space:nowrap;">'
            f'{label}</span>')


def _safe_int_fmt(v):
    """Format a number with thousands separators; '?' for missing/invalid."""
    try:
        return f"{int(v):,}"
    except (TypeError, ValueError):
        return str(v) if v else '?'


def _build_html(title, genre, report_date, chapters, avg_score, total,
                auto_accepted, multi_attempt, full_rewrites, below_threshold,
                polish_applied, score_dist, patterns):
    """Assemble the full self-contained HTML report document.

    All user/model-originated text (titles, POV names, critiques, notes) is
    escaped with html.escape before being interpolated into markup.
    """
    avg_color = _score_color(avg_score)

    # --- Score timeline: one horizontal bar per chapter ---
    MAX_BAR = 260  # pixel width of a perfect 10/10 bar
    timeline_rows = ''
    for c in chapters:
        s = c.get('final_score', 0)
        color = _score_color(s)
        width = max(2, int((s / 10) * MAX_BAR)) if s else 2
        ch_num = c.get('chapter_num', '?')
        ch_title = html.escape(str(c.get('title', ''))[:35])
        timeline_rows += (
            f'<div style="display:flex;align-items:center;margin:2px 0;">'
            f'<span style="width:52px;font-size:12px;color:#666;">'
            f'Ch {ch_num}</span>'
            f'<div style="width:{width}px;height:14px;background:{color};'
            f'border-radius:3px;"></div>'
            f'<span style="margin-left:8px;font-size:12px;">'
            f'{s}/10 — {ch_title}</span>'
            f'</div>'
        )

    # --- Score distribution histogram (10 down to 1) ---
    max_dist = max(score_dist.values()) if any(score_dist.values()) else 1
    dist_rows = ''
    for sv in range(10, 0, -1):
        count = score_dist.get(sv, 0)
        w = max(2, int((count / max_dist) * 200)) if count else 0
        color = _score_color(sv)
        dist_rows += (
            f'<div style="display:flex;align-items:center;margin:2px 0;">'
            f'<span style="width:28px;font-size:12px;color:#666;">{sv}</span>'
            f'<div style="width:{w}px;height:14px;background:{color};'
            f'border-radius:3px;"></div>'
            f'<span style="margin-left:8px;font-size:12px;">'
            f'{count} ch{"apters" if count != 1 else "apter"}</span>'
            f'</div>'
        )

    # --- Chapter table rows, each followed by hidden per-attempt sub-rows ---
    chapter_rows = ''
    for c in chapters:
        cid = c.get('chapter_num', 0)
        ch_title = html.escape(str(c.get('title', '')))
        pov = html.escape(str(c.get('pov_character') or '—'))
        pace = html.escape(str(c.get('pacing') or '—'))
        target_w = _safe_int_fmt(c.get('target_words'))
        actual_w = _safe_int_fmt(c.get('actual_words'))
        pos = c.get('chapter_position')
        pos_pct = f"{int(pos * 100)}%" if pos is not None else '—'
        threshold = c.get('score_threshold', '?')
        # `or 0` also covers an explicit None value, which `.get(..., 0)`
        # would pass straight through to the :.3f format below and crash.
        fw_dens = c.get('filter_word_density') or 0
        polish = '✓' if c.get('polish_applied') else '✗'
        polish_c = '#28a745' if c.get('polish_applied') else '#aaa'
        fs = c.get('final_score', 0)
        fd = c.get('final_decision', '')
        attempts = c.get('attempts', [])
        n_att = len(attempts)
        fs_color = _score_color(fs)
        fd_badge = _decision_badge(fd)

        # Attempt detail sub-rows (hidden until the chapter row is clicked).
        att_rows = ''
        for att in attempts:
            an = att.get('n', '?')
            ascr = att.get('score', '?')
            adec = att.get('decision', '')
            acrit = html.escape(str(att.get('critique', 'No critique.')))
            ac = _score_color(ascr)
            abadge = _decision_badge(adec)
            att_rows += (
                f'<tr class="att-{cid}" style="display:none;'
                f'background:#f8f9fa;">'
                f'<td></td><td colspan="10" style="padding:6px 10px;">'
                f'<div style="font-size:12px;margin-bottom:4px;">'
                f'Attempt {an}: <b style="color:{ac};">{ascr}/10</b> '
                f'{abadge}</div>'
                f'<div style="font-size:12px;color:#555;'
                f'white-space:pre-wrap;">{acrit}</div>'
                f'</td></tr>'
            )

        chapter_rows += (
            f'<tr onclick="tog(\'{cid}\')" style="cursor:pointer;'
            f'border-bottom:1px solid #eee;">'
            f'<td>{cid}</td>'
            f'<td>{ch_title}</td>'
            f'<td>{pov}</td>'
            f'<td>{pace}</td>'
            f'<td>{actual_w} / {target_w}</td>'
            f'<td>{pos_pct}</td>'
            f'<td>{threshold}</td>'
            f'<td><span style="color:{polish_c};">{polish}</span> '
            f'{fw_dens:.3f}</td>'
            f'<td><b style="color:{fs_color};">{fs}</b></td>'
            f'<td>{n_att}×</td>'
            f'<td>{fd_badge}</td>'
            f'</tr>'
            f'{att_rows}'
        )

    # --- Critique pattern table ---
    pat_rows = ''
    for pattern, data in patterns.items():
        count = data['count']
        if count == 0:
            continue
        pct = int(count / total * 100) if total else 0
        sev_color = ('#dc3545' if pct >= 50
                     else '#fd7e14' if pct >= 30 else '#17a2b8')
        chlist = ', '.join(f'Ch {x}' for x in data['chapters'][:10])
        if len(data['chapters']) > 10:
            chlist += f' (+{len(data["chapters"]) - 10} more)'
        pat_rows += (
            f'<tr style="border-bottom:1px solid #eee;">'
            f'<td>{pattern}</td>'
            f'<td style="color:{sev_color};font-weight:bold;">'
            f'{count}/{total} ({pct}%)</td>'
            f'<td style="font-size:12px;color:#666;">{chlist}</td>'
            f'</tr>'
        )
    if not pat_rows:
        pat_rows = ('<tr><td colspan="3">No significant patterns detected.'
                    '</td></tr>')

    # --- Prompt tuning notes ---
    notes = _generate_prompt_notes(chapters, avg_score, total, full_rewrites,
                                   below_threshold, patterns)
    notes_html = ''.join(
        f'<li style="margin:6px 0;">{html.escape(n)}</li>' for n in notes
    )

    # CSS / JS kept in plain strings so literal braces need no escaping
    # inside the f-string template below.
    css = (
        "body{font-family:'Segoe UI',Arial,sans-serif;margin:0;"
        "background:#f4f5f7;color:#222;}"
        ".wrap{max-width:1100px;margin:0 auto;padding:24px;}"
        ".card{background:#fff;border-radius:8px;padding:18px;margin:14px 0;"
        "box-shadow:0 1px 3px rgba(0,0,0,.08);}"
        ".stats{display:flex;flex-wrap:wrap;gap:12px;}"
        ".stat{flex:1;min-width:130px;text-align:center;background:#fff;"
        "border-radius:8px;padding:14px;box-shadow:0 1px 3px rgba(0,0,0,.08);}"
        ".stat b{display:block;font-size:26px;}"
        ".stat span{font-size:12px;color:#666;}"
        "table{width:100%;border-collapse:collapse;font-size:13px;}"
        "th{text-align:left;padding:6px;border-bottom:2px solid #ddd;"
        "font-size:12px;color:#555;}"
        "td{padding:6px;vertical-align:top;}"
        "h2{font-size:16px;margin:0 0 10px;}"
    )
    script = (
        "function tog(id){document.querySelectorAll('.att-'+id).forEach("
        "function(r){r.style.display="
        "r.style.display==='none'?'table-row':'none';});}"
    )

    return f'''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Eval Report — {html.escape(title)}</title>
<style>{css}</style>
<script>{script}</script>
</head>
<body>
<div class="wrap">
  <div class="card">
    <div style="font-size:12px;color:#888;">BookApp — Evaluation Report</div>
    <h1 style="margin:4px 0;">{html.escape(title)}</h1>
    <div style="font-size:13px;color:#666;">Genre: {html.escape(genre)}
      &nbsp;|&nbsp; Generated: {report_date}
      &nbsp;|&nbsp; {total} chapter{"s" if total != 1 else ""}</div>
  </div>

  <div class="stats">
    <div class="stat"><b style="color:{avg_color};">{avg_score}</b>
      <span>Avg Score /10</span></div>
    <div class="stat"><b>{auto_accepted}</b>
      <span>Auto-Accepted (8+)</span></div>
    <div class="stat"><b>{multi_attempt}</b><span>Multi-Attempt</span></div>
    <div class="stat"><b>{full_rewrites}</b><span>Full Rewrites</span></div>
    <div class="stat"><b>{below_threshold}</b>
      <span>Below Threshold</span></div>
    <div class="stat"><b>{polish_applied}</b><span>Polish Passes</span></div>
  </div>

  <div class="card">
    <h2>📊 Score Timeline</h2>
    <div style="font-size:11px;color:#888;margin-bottom:8px;">
      8–10 Great &nbsp; 7–7.9 Good &nbsp; 6–6.9 Passable &nbsp; &lt;6 Fail
    </div>
    {timeline_rows}
  </div>

  <div class="card">
    <h2>📈 Score Distribution</h2>
    {dist_rows}
  </div>

  <div class="card">
    <h2>📋 Chapter Breakdown
      <small style="color:#888;">(click any row to expand critiques)</small>
    </h2>
    <table>
      <thead><tr><th>#</th><th>Title</th><th>POV</th><th>Pacing</th>
        <th>Words</th><th>Pos%</th><th>Threshold</th><th>Polish / FW</th>
        <th>Score</th><th>Att.</th><th>Decision</th></tr></thead>
      <tbody>{chapter_rows}</tbody>
    </table>
  </div>

  <div class="card">
    <h2>🔍 Critique Patterns
      <small style="color:#888;">Keyword frequency across all evaluation
      critiques — high % = prompt gap</small>
    </h2>
    <table>
      <thead><tr><th>Issue Pattern</th><th>Frequency</th>
        <th>Affected Chapters</th></tr></thead>
      <tbody>{pat_rows}</tbody>
    </table>
  </div>

  <div class="card">
    <h2>💡 Prompt Tuning Observations</h2>
    <ul style="padding-left:18px;font-size:13px;">{notes_html}</ul>
  </div>
</div>
</body>
</html>'''


# ---------------------------------------------------------------------------
# Auto-observations for prompt tuning
# ---------------------------------------------------------------------------

def _generate_prompt_notes(chapters, avg_score, total, full_rewrites,
                           below_threshold, patterns):
    """Derive human-readable prompt-tuning observations from the eval data.

    Returns a list of note strings; always non-empty (falls back to a
    generic "no patterns" note).
    """
    notes = []

    # Overall score tier.
    if avg_score >= 8:
        notes.append(f"✅ High average score ({avg_score}/10). The generation "
                     "pipeline is performing well. Focus on the few outlier "
                     "chapters below the threshold.")
    elif avg_score >= 7:
        notes.append(f"✓ Solid average score ({avg_score}/10). Minor prompt "
                     "reinforcement should push this above 8. Focus on the "
                     "most common critique pattern.")
    elif avg_score >= 6:
        notes.append(f"⚠ Average score of {avg_score}/10 is below target. "
                     "Strengthen the draft prompt's Deep POV mandate and "
                     "filter-word removal rules.")
    else:
        notes.append(f"🚨 Low average score ({avg_score}/10). The core "
                     "writing prompt needs significant work — review the "
                     "Deep POV mandate, genre mandates, and consider adding "
                     "concrete negative examples.")

    # Full rewrite rate.
    if total > 0:
        rw_pct = int(full_rewrites / total * 100)
        if rw_pct > 30:
            notes.append(f"🔄 High full-rewrite rate ({rw_pct}%, "
                         f"{full_rewrites} triggers). The initial draft "
                         "prompt produces too many sub-6 drafts. Add "
                         "stronger examples or tighten the DEEP_POV_MANDATE "
                         "and PROSE_RULES sections.")
        elif rw_pct > 15:
            notes.append(f"↩ Moderate full-rewrite rate ({rw_pct}%, "
                         f"{full_rewrites} triggers). The draft quality "
                         "could be improved. Check the genre mandates for "
                         "the types of chapters that rewrite most often.")

    # Below-threshold chapters.
    if below_threshold > 0:
        bt_pct = int(below_threshold / total * 100)
        notes.append(f"⚠ {below_threshold} "
                     f"chapter{'s' if below_threshold != 1 else ''} "
                     f"({bt_pct}%) finished below the quality threshold. "
                     "Inspect the individual critiques to see if these "
                     "cluster by POV, pacing, or story position.")

    # Top critique patterns (dict is pre-sorted by descending count).
    for pattern, data in list(patterns.items())[:5]:
        pct = int(data['count'] / total * 100) if total else 0
        if pct >= 50:
            notes.append(f"🔴 '{pattern}' appears in {pct}% of critiques. "
                         "This is systemic — the current prompt does not "
                         "prevent it. Add an explicit enforcement "
                         "instruction with a concrete example of the wrong "
                         "pattern and the correct alternative.")
        elif pct >= 30:
            notes.append(f"🟡 '{pattern}' mentioned in {pct}% of critiques. "
                         "Consider reinforcing the relevant prompt "
                         "instruction with a stronger negative example.")

    # Climax vs. early chapter comparison.
    # NOTE(review): only float positions qualify — ints (e.g. 0) are
    # excluded by the isinstance check; confirm that is intentional.
    high_scores = [c.get('final_score', 0) for c in chapters
                   if isinstance(c.get('chapter_position'), float)
                   and c['chapter_position'] >= 0.75]
    low_scores = [c.get('final_score', 0) for c in chapters
                  if isinstance(c.get('chapter_position'), float)
                  and c['chapter_position'] < 0.25]
    if high_scores and low_scores:
        avg_climax = round(sum(high_scores) / len(high_scores), 1)
        avg_early = round(sum(low_scores) / len(low_scores), 1)
        if avg_climax < avg_early - 0.5:
            notes.append(f"📅 Climax chapters average {avg_climax}/10 vs "
                         f"early chapters {avg_early}/10. The high-stakes "
                         "scenes underperform. Strengthen the genre "
                         "mandates for climax pacing and consider adding "
                         "specific instructions for emotional payoff.")
        elif avg_climax > avg_early + 0.5:
            notes.append(f"📅 Climax chapters outperform early chapters "
                         f"({avg_climax} vs {avg_early}). Good — the "
                         "adaptive threshold and extra attempts are "
                         "concentrating quality where it matters.")

    # POV character analysis: flag consistently weak POVs (2+ chapters).
    pov_scores = {}
    for c in chapters:
        pov = c.get('pov_character') or 'Unknown'
        s = c.get('final_score', 0)
        if s > 0:
            pov_scores.setdefault(pov, []).append(s)
    for pov, sc in sorted(pov_scores.items(),
                          key=lambda x: sum(x[1]) / len(x[1])):
        if len(sc) >= 2 and sum(sc) / len(sc) < 6.5:
            avg_pov = round(sum(sc) / len(sc), 1)
            notes.append(f"👤 POV '{pov}' averages {avg_pov}/10. Consider "
                         "adding or strengthening a character voice profile "
                         "for this character, or refining the persona bio "
                         "to match how this POV character should speak and "
                         "think.")

    # Pacing analysis: flag rhythms the model consistently struggles with.
    pace_scores = {}
    for c in chapters:
        pace = c.get('pacing', 'Standard')
        s = c.get('final_score', 0)
        if s > 0:
            pace_scores.setdefault(pace, []).append(s)
    for pace, sc in pace_scores.items():
        if len(sc) >= 3 and sum(sc) / len(sc) < 6.5:
            avg_p = round(sum(sc) / len(sc), 1)
            notes.append(f"⏩ '{pace}' pacing chapters average {avg_p}/10. "
                         "The writing model struggles with this rhythm. "
                         "Revisit the PACING_GUIDE instructions for "
                         f"'{pace}' chapters — they may need more concrete "
                         "direction.")

    if not notes:
        notes.append("No significant patterns detected. Review the "
                     "individual chapter critiques for targeted "
                     "improvements.")
    return notes