feat: Add evaluation report pipeline for prompt tuning feedback
Adds a full per-chapter evaluation logging system that captures every
score, critique, and quality decision made during writing, then renders
a self-contained HTML report shareable with critics or prompt engineers.
New file — story/eval_logger.py:
- append_eval_entry(folder, entry): writes per-chapter eval data to
eval_log.json in the book folder (called from write_chapter() at
every return point).
- generate_html_report(folder, bp): reads eval_log.json and produces a
self-contained HTML file (no external deps) with:
• Summary cards (avg score, auto-accepted, rewrites, below-threshold)
• Score timeline bar chart (one bar per chapter, colour-coded)
• Score distribution histogram
• Chapter breakdown table with expand-on-click critique details
(attempt number, score, decision badge, full critique text)
• Critique pattern frequency table (keyword mining across all critiques)
• Auto-generated prompt tuning observations (systemic issues, POV
character weak spots, pacing type analysis, climax vs. early
chapter comparison)
story/writer.py:
- Imports time and eval_logger.
- Initialises _eval_entry dict (chapter metadata + polish flags + thresholds)
after all threshold variables are set.
- Records each evaluation attempt's score, critique (truncated to 700 chars),
and decision (auto_accepted / full_rewrite / refinement / accepted /
below_threshold / eval_error / refinement_failed) before every return.
web/routes/run.py:
- Imports story_eval_logger.
- New route GET /project/<run_id>/eval_report/<book_folder>: loads
eval_log.json, calls generate_html_report(), returns the HTML as a
downloadable attachment named eval_report_<title>.html.
Returns a user-friendly "not yet available" page if no log exists.
templates/run_details.html:
- Adds "Eval Report" (btn-outline-info) button next to "Check Consistency"
in each book's artifact section.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
473
story/eval_logger.py
Normal file
473
story/eval_logger.py
Normal file
@@ -0,0 +1,473 @@
|
||||
"""eval_logger.py — Per-chapter evaluation log and HTML report generator.
|
||||
|
||||
Writes a structured eval_log.json to the book folder during writing, then
|
||||
generates a self-contained HTML report that can be downloaded and shared with
|
||||
critics / prompt engineers to analyse quality patterns across a run.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from core import utils
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Log writer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def append_eval_entry(folder, entry):
    """Append one chapter's evaluation record to eval_log.json.

    Called from story/writer.py at every return point in write_chapter().
    Each entry captures the chapter metadata, polish decision, per-attempt
    scores/critiques/decisions, and the final accepted score.
    """
    log_path = os.path.join(folder, "eval_log.json")

    # Load the existing log; anything unreadable (missing file, corrupt JSON,
    # non-list payload) simply restarts the log with a fresh list.
    records = []
    if os.path.exists(log_path):
        try:
            with open(log_path, 'r', encoding='utf-8') as fh:
                loaded = json.load(fh)
            if isinstance(loaded, list):
                records = loaded
        except Exception:
            records = []

    records.append(entry)

    # Best-effort write: a failure here must never abort the writing pipeline,
    # so it is logged and swallowed.
    try:
        with open(log_path, 'w', encoding='utf-8') as fh:
            json.dump(records, fh, indent=2)
    except Exception as e:
        utils.log("EVAL", f"Failed to write eval log: {e}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Report generation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def generate_html_report(folder, bp=None):
    """Generate a self-contained HTML evaluation report from eval_log.json.

    Returns the HTML string, or None if no log file exists / is empty.
    """
    log_path = os.path.join(folder, "eval_log.json")
    if not os.path.exists(log_path):
        return None
    try:
        with open(log_path, 'r', encoding='utf-8') as fh:
            chapters = json.load(fh)
    except Exception:
        return None

    if not (isinstance(chapters, list) and chapters):
        return None

    # Pull title/genre from the blueprint when one was supplied.
    title = "Unknown Book"
    genre = "Fiction"
    if bp:
        meta = bp.get('book_metadata', {})
        title = meta.get('title', title)
        genre = meta.get('genre', genre)

    # --- Summary stats (only positive numeric scores count toward the avg) ---
    scores = []
    for c in chapters:
        fs = c.get('final_score', 0)
        if isinstance(fs, (int, float)) and fs > 0:
            scores.append(fs)
    avg_score = round(sum(scores) / len(scores), 2) if scores else 0
    total = len(chapters)
    auto_accepted = sum(c.get('final_decision') == 'auto_accepted' for c in chapters)
    multi_attempt = sum(len(c.get('attempts', [])) > 1 for c in chapters)
    full_rewrites = sum(
        a.get('decision') == 'full_rewrite'
        for c in chapters
        for a in c.get('attempts', [])
    )
    below_threshold = sum(c.get('final_decision') == 'below_threshold' for c in chapters)
    polish_applied = sum(bool(c.get('polish_applied')) for c in chapters)

    # Histogram of integer final scores in the 1..10 range.
    score_dist = dict.fromkeys(range(1, 11), 0)
    for c in chapters:
        s = c.get('final_score', 0)
        if isinstance(s, int) and 1 <= s <= 10:
            score_dist[s] += 1

    patterns = _mine_critique_patterns(chapters, total)
    report_date = time.strftime('%Y-%m-%d %H:%M')
    return _build_html(title, genre, report_date, chapters, avg_score, total,
                       auto_accepted, multi_attempt, full_rewrites, below_threshold,
                       polish_applied, score_dist, patterns)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pattern mining
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _mine_critique_patterns(chapters, total):
|
||||
pattern_keywords = {
|
||||
"Filter words (felt/saw/noticed)": ["filter word", "filter", "felt ", "noticed ", "realized ", "saw the", "heard the"],
|
||||
"Summary mode / telling": ["summary mode", "summariz", "telling", "show don't tell", "show, don't tell", "instead of dramatiz"],
|
||||
"Emotion labeling": ["emotion label", "told the reader", "labeling", "labelling", "she felt", "he felt", "was nervous", "was angry", "was sad"],
|
||||
"Deep POV issues": ["deep pov", "deep point of view", "distant narration", "remove the reader", "external narration"],
|
||||
"Pacing problems": ["pacing", "rushing", "too fast", "too slow", "dragging", "sagging", "abrupt"],
|
||||
"Dialogue too on-the-nose": ["on-the-nose", "on the nose", "subtext", "exposition dump", "characters explain"],
|
||||
"Weak chapter hook / ending": ["hook", "cliffhanger", "cut off abruptly", "anticlimax", "ending falls flat", "no tension"],
|
||||
"Passive voice / weak verbs": ["passive voice", "was [v", "were [v", "weak verb", "adverb"],
|
||||
"AI-isms / clichés": ["ai-ism", "cliché", "tapestry", "palpable", "testament", "azure", "cerulean", "bustling"],
|
||||
"Voice / tone inconsistency": ["voice", "tone inconsist", "persona", "shift in tone", "register"],
|
||||
"Missing sensory / atmosphere": ["sensory", "grounding", "atmosphere", "immersiv", "white room"],
|
||||
}
|
||||
counts = {}
|
||||
for pattern, keywords in pattern_keywords.items():
|
||||
matching = []
|
||||
for c in chapters:
|
||||
critique_blob = " ".join(
|
||||
a.get('critique', '').lower()
|
||||
for a in c.get('attempts', [])
|
||||
)
|
||||
if any(kw.lower() in critique_blob for kw in keywords):
|
||||
matching.append(c.get('chapter_num', '?'))
|
||||
counts[pattern] = {'count': len(matching), 'chapters': matching}
|
||||
return dict(sorted(counts.items(), key=lambda x: x[1]['count'], reverse=True))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTML builder
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _score_color(s):
|
||||
try:
|
||||
s = float(s)
|
||||
except (TypeError, ValueError):
|
||||
return '#6c757d'
|
||||
if s >= 8: return '#28a745'
|
||||
if s >= 7: return '#20c997'
|
||||
if s >= 6: return '#ffc107'
|
||||
return '#dc3545'
|
||||
|
||||
|
||||
def _decision_badge(d):
|
||||
MAP = {
|
||||
'auto_accepted': ('⚡ Auto-Accept', '#28a745'),
|
||||
'accepted': ('✓ Accepted', '#17a2b8'),
|
||||
'accepted_at_max': ('✓ Accepted', '#17a2b8'),
|
||||
'below_threshold': ('⚠ Below Threshold', '#dc3545'),
|
||||
'below_threshold_accepted': ('⚠ Below Threshold', '#dc3545'),
|
||||
'full_rewrite': ('🔄 Full Rewrite', '#6f42c1'),
|
||||
'full_rewrite_failed': ('🔄✗ Rewrite Failed','#6f42c1'),
|
||||
'refinement': ('✏ Refined', '#fd7e14'),
|
||||
'refinement_failed': ('✏✗ Refine Failed', '#fd7e14'),
|
||||
'eval_error': ('⚠ Eval Error', '#6c757d'),
|
||||
}
|
||||
label, color = MAP.get(d, (d or '?', '#6c757d'))
|
||||
return f'<span style="background:{color};color:white;padding:2px 8px;border-radius:4px;font-size:0.78em">{label}</span>'
|
||||
|
||||
|
||||
def _safe_int_fmt(v):
|
||||
try:
|
||||
return f"{int(v):,}"
|
||||
except (TypeError, ValueError):
|
||||
return str(v) if v else '?'
|
||||
|
||||
|
||||
def _esc(value):
    """HTML-escape a value for safe interpolation into the report markup.

    '&' must be replaced first so it does not double-escape the entities
    produced by the following replacements.
    """
    return str(value).replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


def _build_html(title, genre, report_date, chapters, avg_score, total,
                auto_accepted, multi_attempt, full_rewrites, below_threshold,
                polish_applied, score_dist, patterns):
    """Assemble the complete self-contained HTML report string.

    All aggregates are pre-computed by generate_html_report(). Chapter titles
    and critique text are model-generated free text and are HTML-escaped
    before interpolation (the previous replace chains were no-ops and never
    escaped anything, so '<'/'&' in titles or critiques broke the markup).
    """

    avg_color = _score_color(avg_score)

    # --- Score timeline (one colour-coded bar per chapter) ---
    MAX_BAR = 260
    timeline_rows = ''
    for c in chapters:
        s = c.get('final_score', 0)
        color = _score_color(s)
        # Guard: a malformed (non-numeric) score renders a minimum-width bar
        # instead of crashing the arithmetic below.
        if isinstance(s, (int, float)) and s:
            width = max(2, int((s / 10) * MAX_BAR))
        else:
            width = 2
        ch_num = c.get('chapter_num', '?')
        ch_title = _esc(str(c.get('title', ''))[:35])
        timeline_rows += (
            f'<div style="display:flex;align-items:center;margin-bottom:4px;font-size:0.8em">'
            f'<div style="width:45px;text-align:right;margin-right:8px;color:#888;flex-shrink:0">Ch {ch_num}</div>'
            f'<div style="background:{color};height:16px;width:{width}px;border-radius:2px;flex-shrink:0"></div>'
            f'<div style="margin-left:8px;color:#555">{s}/10 — {ch_title}</div>'
            f'</div>'
        )

    # --- Score distribution histogram (10 down to 1) ---
    max_dist = max(score_dist.values()) if any(score_dist.values()) else 1
    dist_rows = ''
    for sv in range(10, 0, -1):
        count = score_dist.get(sv, 0)
        w = max(2, int((count / max_dist) * 200)) if count else 0
        color = _score_color(sv)
        dist_rows += (
            f'<div style="display:flex;align-items:center;margin-bottom:4px;font-size:0.85em">'
            f'<div style="width:28px;text-align:right;margin-right:8px;font-weight:bold;color:{color}">{sv}</div>'
            f'<div style="background:{color};height:15px;width:{w}px;border-radius:2px;opacity:0.85"></div>'
            f'<div style="margin-left:8px;color:#666">{count} ch{"apters" if count != 1 else "apter"}</div>'
            f'</div>'
        )

    # --- Chapter breakdown rows (click-to-expand critique detail) ---
    chapter_rows = ''
    for c in chapters:
        cid = c.get('chapter_num', 0)
        ch_title = _esc(c.get('title', ''))
        pov = str(c.get('pov_character') or '—')
        pace = str(c.get('pacing') or '—')
        target_w = _safe_int_fmt(c.get('target_words'))
        actual_w = _safe_int_fmt(c.get('actual_words'))
        pos = c.get('chapter_position')
        pos_pct = f"{int(pos * 100)}%" if isinstance(pos, (int, float)) else '—'
        threshold = c.get('score_threshold', '?')
        fw_dens = c.get('filter_word_density', 0)
        if not isinstance(fw_dens, (int, float)):
            # NOTE(review): density should be numeric; fall back to 0 so a
            # None/odd value cannot crash the {:.3f} formatting below.
            fw_dens = 0
        polish = '✓' if c.get('polish_applied') else '✗'
        polish_c = '#28a745' if c.get('polish_applied') else '#aaa'
        fs = c.get('final_score', 0)
        fd = c.get('final_decision', '')
        attempts = c.get('attempts', [])
        n_att = len(attempts)
        fs_color = _score_color(fs)
        fd_badge = _decision_badge(fd)

        # Attempt detail sub-rows
        att_rows = ''
        for att in attempts:
            an = att.get('n', '?')
            ascr = att.get('score', '?')
            adec = att.get('decision', '')
            acrit = _esc(att.get('critique', 'No critique.'))
            ac = _score_color(ascr)
            abadge = _decision_badge(adec)
            att_rows += (
                f'<tr style="background:#f6f8fa">'
                f'<td colspan="11" style="padding:12px 16px 12px 56px;border-bottom:1px solid #e8eaed">'
                f'<div style="margin-bottom:6px"><strong>Attempt {an}:</strong>'
                f'<span style="font-size:1.1em;font-weight:bold;color:{ac};margin:0 8px">{ascr}/10</span>'
                f'{abadge}</div>'
                f'<div style="font-size:0.83em;color:#444;line-height:1.55;white-space:pre-wrap;'
                f'background:#fff;padding:10px 12px;border-left:3px solid {ac};border-radius:2px;'
                f'max-height:300px;overflow-y:auto">{acrit}</div>'
                f'</td></tr>'
            )

        chapter_rows += (
            f'<tr class="chrow" onclick="toggle({cid})" style="cursor:pointer">'
            f'<td style="font-weight:700;text-align:center">{cid}</td>'
            f'<td>{ch_title}</td>'
            f'<td style="color:#666;font-size:0.85em">{pov}</td>'
            f'<td style="color:#666;font-size:0.85em">{pace}</td>'
            f'<td style="text-align:right">{actual_w} <span style="color:#aaa">/{target_w}</span></td>'
            f'<td style="text-align:center;color:#888">{pos_pct}</td>'
            f'<td style="text-align:center">{threshold}</td>'
            f'<td style="text-align:center;color:{polish_c}">{polish} <span style="color:#aaa;font-size:0.8em">{fw_dens:.3f}</span></td>'
            f'<td style="text-align:center;font-weight:700;font-size:1.1em;color:{fs_color}">{fs}</td>'
            f'<td style="text-align:center;color:#888">{n_att}×</td>'
            f'<td>{fd_badge}</td>'
            f'</tr>'
            f'<tr id="d{cid}" class="detrow">{att_rows}</tr>'
        )

    # --- Critique pattern frequency table ---
    pat_rows = ''
    for pattern, data in patterns.items():
        count = data['count']
        if count == 0:
            continue
        pct = int(count / total * 100) if total else 0
        sev_color = '#dc3545' if pct >= 50 else '#fd7e14' if pct >= 30 else '#17a2b8'
        chlist = ', '.join(f'Ch {x}' for x in data['chapters'][:10])
        if len(data['chapters']) > 10:
            chlist += f' (+{len(data["chapters"]) - 10} more)'
        pat_rows += (
            f'<tr>'
            f'<td><strong>{pattern}</strong></td>'
            f'<td style="text-align:center;color:{sev_color};font-weight:700">{count}/{total} ({pct}%)</td>'
            f'<td style="color:#666;font-size:0.83em">{chlist}</td>'
            f'</tr>'
        )
    if not pat_rows:
        pat_rows = '<tr><td colspan="3" style="color:#666;text-align:center;padding:12px">No significant patterns detected.</td></tr>'

    # --- Prompt tuning notes (notes contain intentional <strong> markup) ---
    notes = _generate_prompt_notes(chapters, avg_score, total, full_rewrites, below_threshold, patterns)
    notes_html = ''.join(f'<li style="margin-bottom:8px;line-height:1.55">{n}</li>' for n in notes)

    return f'''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Eval Report — {title}</title>
<style>
*{{box-sizing:border-box;margin:0;padding:0}}
body{{font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,sans-serif;background:#f0f2f5;color:#333;padding:20px}}
.wrap{{max-width:1280px;margin:0 auto}}
header{{background:#1a1d23;color:#fff;padding:22px 28px;border-radius:10px;margin-bottom:22px}}
header h1{{font-size:0.9em;color:#8b92a1;margin-bottom:4px;font-weight:500}}
header h2{{font-size:1.9em;font-weight:700;margin-bottom:6px}}
header p{{color:#8b92a1;font-size:0.88em}}
.cards{{display:grid;grid-template-columns:repeat(auto-fit,minmax(130px,1fr));gap:12px;margin-bottom:20px}}
.card{{background:#fff;border-radius:8px;padding:16px;text-align:center;box-shadow:0 1px 3px rgba(0,0,0,.08)}}
.card .val{{font-size:2em;font-weight:700}}
.card .lbl{{font-size:0.75em;color:#888;margin-top:4px;line-height:1.3}}
.two-col{{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-bottom:16px}}
section{{background:#fff;border-radius:8px;padding:20px;margin-bottom:16px;box-shadow:0 1px 3px rgba(0,0,0,.08)}}
section h3{{font-size:1em;font-weight:700;border-bottom:2px solid #f0f0f0;padding-bottom:8px;margin-bottom:14px}}
table{{width:100%;border-collapse:collapse;font-size:0.86em}}
th{{background:#f7f8fa;padding:8px 10px;text-align:left;font-weight:600;color:#555;border-bottom:2px solid #e0e4ea;white-space:nowrap}}
td{{padding:8px 10px;border-bottom:1px solid #f0f0f0;vertical-align:middle}}
.chrow:hover{{background:#f7f8fa}}
.detrow{{display:none}}
.legend{{display:flex;gap:14px;flex-wrap:wrap;font-size:0.78em;color:#777;margin-bottom:10px}}
.dot{{display:inline-block;width:11px;height:11px;border-radius:50%;vertical-align:middle;margin-right:3px}}
ul.notes{{padding-left:20px}}
@media(max-width:768px){{.two-col{{grid-template-columns:1fr}}}}
</style>
</head>
<body>
<div class="wrap">

<header>
<h1>BookApp — Evaluation Report</h1>
<h2>{title}</h2>
<p>Genre: {genre} | Generated: {report_date} | {total} chapter{"s" if total != 1 else ""}</p>
</header>

<div class="cards">
<div class="card"><div class="val" style="color:{avg_color}">{avg_score}</div><div class="lbl">Avg Score /10</div></div>
<div class="card"><div class="val" style="color:#28a745">{auto_accepted}</div><div class="lbl">Auto-Accepted (8+)</div></div>
<div class="card"><div class="val" style="color:#17a2b8">{multi_attempt}</div><div class="lbl">Multi-Attempt</div></div>
<div class="card"><div class="val" style="color:#6f42c1">{full_rewrites}</div><div class="lbl">Full Rewrites</div></div>
<div class="card"><div class="val" style="color:#dc3545">{below_threshold}</div><div class="lbl">Below Threshold</div></div>
<div class="card"><div class="val" style="color:#fd7e14">{polish_applied}</div><div class="lbl">Polish Passes</div></div>
</div>

<div class="two-col">
<section>
<h3>📊 Score Timeline</h3>
<div class="legend">
<span><span class="dot" style="background:#28a745"></span>8–10 Great</span>
<span><span class="dot" style="background:#20c997"></span>7–7.9 Good</span>
<span><span class="dot" style="background:#ffc107"></span>6–6.9 Passable</span>
<span><span class="dot" style="background:#dc3545"></span>&lt;6 Fail</span>
</div>
<div style="overflow-y:auto;max-height:420px;padding-right:4px">{timeline_rows}</div>
</section>
<section>
<h3>📈 Score Distribution</h3>
<div style="margin-top:8px">{dist_rows}</div>
</section>
</div>

<section>
<h3>📋 Chapter Breakdown <small style="font-weight:400;color:#888">(click any row to expand critiques)</small></h3>
<div style="overflow-x:auto">
<table>
<thead><tr>
<th>#</th><th>Title</th><th>POV</th><th>Pacing</th>
<th style="text-align:right">Words</th>
<th style="text-align:center">Pos%</th>
<th style="text-align:center">Threshold</th>
<th style="text-align:center">Polish / FW</th>
<th style="text-align:center">Score</th>
<th style="text-align:center">Att.</th>
<th>Decision</th>
</tr></thead>
<tbody>{chapter_rows}</tbody>
</table>
</div>
</section>

<section>
<h3>🔍 Critique Patterns <small style="font-weight:400;color:#888">Keyword frequency across all evaluation critiques — high % = prompt gap</small></h3>
<table>
<thead><tr><th>Issue Pattern</th><th style="text-align:center">Frequency</th><th>Affected Chapters</th></tr></thead>
<tbody>{pat_rows}</tbody>
</table>
</section>

<section>
<h3>💡 Prompt Tuning Observations</h3>
<ul class="notes">{notes_html}</ul>
</section>

</div>
<script>
function toggle(id){{
var r=document.getElementById('d'+id);
if(r) r.style.display=(r.style.display==='none'||r.style.display==='')?'table-row':'none';
}}
document.querySelectorAll('.detrow').forEach(function(r){{r.style.display='none';}});
</script>
</body>
</html>'''
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Auto-observations for prompt tuning
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _generate_prompt_notes(chapters, avg_score, total, full_rewrites, below_threshold, patterns):
|
||||
notes = []
|
||||
|
||||
# Overall score
|
||||
if avg_score >= 8:
|
||||
notes.append(f"✅ <strong>High average score ({avg_score}/10).</strong> The generation pipeline is performing well. Focus on the few outlier chapters below the threshold.")
|
||||
elif avg_score >= 7:
|
||||
notes.append(f"✓ <strong>Solid average score ({avg_score}/10).</strong> Minor prompt reinforcement should push this above 8. Focus on the most common critique pattern.")
|
||||
elif avg_score >= 6:
|
||||
notes.append(f"⚠ <strong>Average score of {avg_score}/10 is below target.</strong> Strengthen the draft prompt's Deep POV mandate and filter-word removal rules.")
|
||||
else:
|
||||
notes.append(f"🚨 <strong>Low average score ({avg_score}/10).</strong> The core writing prompt needs significant work — review the Deep POV mandate, genre mandates, and consider adding concrete negative examples.")
|
||||
|
||||
# Full rewrite rate
|
||||
if total > 0:
|
||||
rw_pct = int(full_rewrites / total * 100)
|
||||
if rw_pct > 30:
|
||||
notes.append(f"🔄 <strong>High full-rewrite rate ({rw_pct}%, {full_rewrites} triggers).</strong> The initial draft prompt produces too many sub-6 drafts. Add stronger examples or tighten the DEEP_POV_MANDATE and PROSE_RULES sections.")
|
||||
elif rw_pct > 15:
|
||||
notes.append(f"↩ <strong>Moderate full-rewrite rate ({rw_pct}%, {full_rewrites} triggers).</strong> The draft quality could be improved. Check the genre mandates for the types of chapters that rewrite most often.")
|
||||
|
||||
# Below threshold
|
||||
if below_threshold > 0:
|
||||
bt_pct = int(below_threshold / total * 100)
|
||||
notes.append(f"⚠ <strong>{below_threshold} chapter{'s' if below_threshold != 1 else ''} ({bt_pct}%) finished below the quality threshold.</strong> Inspect the individual critiques to see if these cluster by POV, pacing, or story position.")
|
||||
|
||||
# Top critique patterns
|
||||
for pattern, data in list(patterns.items())[:5]:
|
||||
pct = int(data['count'] / total * 100) if total else 0
|
||||
if pct >= 50:
|
||||
notes.append(f"🔴 <strong>'{pattern}' appears in {pct}% of critiques.</strong> This is systemic — the current prompt does not prevent it. Add an explicit enforcement instruction with a concrete example of the wrong pattern and the correct alternative.")
|
||||
elif pct >= 30:
|
||||
notes.append(f"🟡 <strong>'{pattern}' mentioned in {pct}% of critiques.</strong> Consider reinforcing the relevant prompt instruction with a stronger negative example.")
|
||||
|
||||
# Climax vs. early chapter comparison
|
||||
high_scores = [c.get('final_score', 0) for c in chapters if isinstance(c.get('chapter_position'), float) and c['chapter_position'] >= 0.75]
|
||||
low_scores = [c.get('final_score', 0) for c in chapters if isinstance(c.get('chapter_position'), float) and c['chapter_position'] < 0.25]
|
||||
if high_scores and low_scores:
|
||||
avg_climax = round(sum(high_scores) / len(high_scores), 1)
|
||||
avg_early = round(sum(low_scores) / len(low_scores), 1)
|
||||
if avg_climax < avg_early - 0.5:
|
||||
notes.append(f"📅 <strong>Climax chapters average {avg_climax}/10 vs early chapters {avg_early}/10.</strong> The high-stakes scenes underperform. Strengthen the genre mandates for climax pacing and consider adding specific instructions for emotional payoff.")
|
||||
elif avg_climax > avg_early + 0.5:
|
||||
notes.append(f"📅 <strong>Climax chapters outperform early chapters ({avg_climax} vs {avg_early}).</strong> Good — the adaptive threshold and extra attempts are concentrating quality where it matters.")
|
||||
|
||||
# POV character analysis
|
||||
pov_scores = {}
|
||||
for c in chapters:
|
||||
pov = c.get('pov_character') or 'Unknown'
|
||||
s = c.get('final_score', 0)
|
||||
if s > 0:
|
||||
pov_scores.setdefault(pov, []).append(s)
|
||||
for pov, sc in sorted(pov_scores.items(), key=lambda x: sum(x[1]) / len(x[1])):
|
||||
if len(sc) >= 2 and sum(sc) / len(sc) < 6.5:
|
||||
avg_pov = round(sum(sc) / len(sc), 1)
|
||||
notes.append(f"👤 <strong>POV '{pov}' averages {avg_pov}/10.</strong> Consider adding or strengthening a character voice profile for this character, or refining the persona bio to match how this POV character should speak and think.")
|
||||
|
||||
# Pacing analysis
|
||||
pace_scores = {}
|
||||
for c in chapters:
|
||||
pace = c.get('pacing', 'Standard')
|
||||
s = c.get('final_score', 0)
|
||||
if s > 0:
|
||||
pace_scores.setdefault(pace, []).append(s)
|
||||
for pace, sc in pace_scores.items():
|
||||
if len(sc) >= 3 and sum(sc) / len(sc) < 6.5:
|
||||
avg_p = round(sum(sc) / len(sc), 1)
|
||||
notes.append(f"⏩ <strong>'{pace}' pacing chapters average {avg_p}/10.</strong> The writing model struggles with this rhythm. Revisit the PACING_GUIDE instructions for '{pace}' chapters — they may need more concrete direction.")
|
||||
|
||||
if not notes:
|
||||
notes.append("No significant patterns detected. Review the individual chapter critiques for targeted improvements.")
|
||||
return notes
|
||||
Reference in New Issue
Block a user