feat: Add evaluation report pipeline for prompt tuning feedback
Adds a full per-chapter evaluation logging system that captures every
score, critique, and quality decision made during writing, then renders
a self-contained HTML report shareable with critics or prompt engineers.
New file — story/eval_logger.py:
- append_eval_entry(folder, entry): writes per-chapter eval data to
eval_log.json in the book folder (called from write_chapter() at
every return point).
- generate_html_report(folder, bp): reads eval_log.json and produces a
self-contained HTML file (no external deps) with:
• Summary cards (avg score, auto-accepted, rewrites, below-threshold)
• Score timeline bar chart (one bar per chapter, colour-coded)
• Score distribution histogram
• Chapter breakdown table with expand-on-click critique details
(attempt number, score, decision badge, full critique text)
• Critique pattern frequency table (keyword mining across all critiques)
• Auto-generated prompt tuning observations (systemic issues, POV
character weak spots, pacing type analysis, climax vs. early
chapter comparison)
story/writer.py:
- Imports time and eval_logger.
- Initialises _eval_entry dict (chapter metadata + polish flags + thresholds)
after all threshold variables are set.
- Records each evaluation attempt's score, critique (truncated to 700 chars),
and decision (auto_accepted / full_rewrite / refinement / accepted /
below_threshold / eval_error / refinement_failed) before every return.
web/routes/run.py:
- Imports story_eval_logger.
- New route GET /project/<run_id>/eval_report/<book_folder>: loads
eval_log.json, calls generate_html_report(), returns the HTML as a
downloadable attachment named eval_report_<title>.html.
Returns a user-friendly "not yet available" page if no log exists.
templates/run_details.html:
- Adds "Eval Report" (btn-outline-info) button next to "Check Consistency"
in each book's artifact section.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
473
story/eval_logger.py
Normal file
473
story/eval_logger.py
Normal file
@@ -0,0 +1,473 @@
|
|||||||
|
"""eval_logger.py — Per-chapter evaluation log and HTML report generator.
|
||||||
|
|
||||||
|
Writes a structured eval_log.json to the book folder during writing, then
|
||||||
|
generates a self-contained HTML report that can be downloaded and shared with
|
||||||
|
critics / prompt engineers to analyse quality patterns across a run.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from core import utils
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Log writer
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def append_eval_entry(folder, entry):
    """Append one chapter's evaluation record to eval_log.json.

    Called from story/writer.py at every return point in write_chapter().
    Each entry captures the chapter metadata, polish decision, per-attempt
    scores/critiques/decisions, and the final accepted score.

    Args:
        folder: Directory that holds (or will hold) ``eval_log.json``.
        entry: JSON-serialisable record for one chapter.

    The write is best-effort: any failure is logged through ``utils.log`` and
    never propagates to the caller. An existing log that is unreadable or is
    not a JSON list is replaced by a fresh list containing only ``entry``.
    """
    log_path = os.path.join(folder, "eval_log.json")
    data = []
    try:
        with open(log_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except FileNotFoundError:
        # First entry for this book: start a new log.
        pass
    except (OSError, ValueError):
        # Unreadable or corrupt log (JSONDecodeError / UnicodeDecodeError are
        # ValueError subclasses): start over rather than abort the write.
        pass
    else:
        if isinstance(loaded, list):
            data = loaded
    data.append(entry)

    tmp_path = log_path + ".tmp"
    try:
        # Serialise before touching the disk so a non-serialisable entry
        # cannot leave a half-written (truncated) log behind.
        payload = json.dumps(data, indent=2)
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, log_path)
    except Exception as e:
        # Deliberately broad: evaluation logging must never break chapter
        # writing, and the failure is reported rather than swallowed.
        utils.log("EVAL", f"Failed to write eval log: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Report generation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def generate_html_report(folder, bp=None):
    """Generate a self-contained HTML evaluation report from eval_log.json.

    Args:
        folder: Directory containing ``eval_log.json``.
        bp: Optional mapping; ``bp['book_metadata']`` supplies the ``title``
            and ``genre`` shown in the report header when present.

    Returns:
        The HTML string, or None if the log file is missing, unreadable, not
        a JSON list, or holds no usable (dict) chapter entries.
    """
    log_path = os.path.join(folder, "eval_log.json")
    try:
        with open(log_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (OSError, ValueError):
        # Missing file, unreadable file, or invalid JSON / encoding.
        return None

    if not isinstance(loaded, list):
        return None
    # Ignore malformed entries instead of crashing on ``.get`` further down.
    chapters = [c for c in loaded if isinstance(c, dict)]
    if not chapters:
        return None

    title, genre = "Unknown Book", "Fiction"
    if bp:
        meta = bp.get('book_metadata') or {}
        # ``or`` keeps the defaults when a key exists but is None/empty.
        title = meta.get('title') or title
        genre = meta.get('genre') or genre

    def _is_number(value):
        """True for real int/float scores; bool is excluded on purpose."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def _attempts(chapter):
        """Return the chapter's attempt records, skipping malformed ones."""
        return [a for a in (chapter.get('attempts') or []) if isinstance(a, dict)]

    # --- Summary stats ---
    final_scores = [c.get('final_score') for c in chapters]
    # A score of 0 means "never evaluated", so it is left out of the average.
    scores = [s for s in final_scores if _is_number(s) and s > 0]
    avg_score = round(sum(scores) / len(scores), 2) if scores else 0
    total = len(chapters)
    auto_accepted = sum(1 for c in chapters if c.get('final_decision') == 'auto_accepted')
    multi_attempt = sum(1 for c in chapters if len(c.get('attempts') or []) > 1)
    full_rewrites = sum(
        1 for c in chapters for a in _attempts(c) if a.get('decision') == 'full_rewrite'
    )
    below_threshold = sum(1 for c in chapters if c.get('final_decision') == 'below_threshold')
    polish_applied = sum(1 for c in chapters if c.get('polish_applied'))

    # Histogram buckets 1..10. Fractional scores are floored into their band
    # (7.5 -> 7) so they are counted instead of silently dropped.
    score_dist = {i: 0 for i in range(1, 11)}
    for s in final_scores:
        if _is_number(s) and 1 <= s <= 10:
            score_dist[int(s)] += 1

    patterns = _mine_critique_patterns(chapters, total)
    report_date = time.strftime('%Y-%m-%d %H:%M')
    return _build_html(title, genre, report_date, chapters, avg_score, total,
                       auto_accepted, multi_attempt, full_rewrites, below_threshold,
                       polish_applied, score_dist, patterns)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Pattern mining
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _mine_critique_patterns(chapters, total):
|
||||||
|
pattern_keywords = {
|
||||||
|
"Filter words (felt/saw/noticed)": ["filter word", "filter", "felt ", "noticed ", "realized ", "saw the", "heard the"],
|
||||||
|
"Summary mode / telling": ["summary mode", "summariz", "telling", "show don't tell", "show, don't tell", "instead of dramatiz"],
|
||||||
|
"Emotion labeling": ["emotion label", "told the reader", "labeling", "labelling", "she felt", "he felt", "was nervous", "was angry", "was sad"],
|
||||||
|
"Deep POV issues": ["deep pov", "deep point of view", "distant narration", "remove the reader", "external narration"],
|
||||||
|
"Pacing problems": ["pacing", "rushing", "too fast", "too slow", "dragging", "sagging", "abrupt"],
|
||||||
|
"Dialogue too on-the-nose": ["on-the-nose", "on the nose", "subtext", "exposition dump", "characters explain"],
|
||||||
|
"Weak chapter hook / ending": ["hook", "cliffhanger", "cut off abruptly", "anticlimax", "ending falls flat", "no tension"],
|
||||||
|
"Passive voice / weak verbs": ["passive voice", "was [v", "were [v", "weak verb", "adverb"],
|
||||||
|
"AI-isms / clichés": ["ai-ism", "cliché", "tapestry", "palpable", "testament", "azure", "cerulean", "bustling"],
|
||||||
|
"Voice / tone inconsistency": ["voice", "tone inconsist", "persona", "shift in tone", "register"],
|
||||||
|
"Missing sensory / atmosphere": ["sensory", "grounding", "atmosphere", "immersiv", "white room"],
|
||||||
|
}
|
||||||
|
counts = {}
|
||||||
|
for pattern, keywords in pattern_keywords.items():
|
||||||
|
matching = []
|
||||||
|
for c in chapters:
|
||||||
|
critique_blob = " ".join(
|
||||||
|
a.get('critique', '').lower()
|
||||||
|
for a in c.get('attempts', [])
|
||||||
|
)
|
||||||
|
if any(kw.lower() in critique_blob for kw in keywords):
|
||||||
|
matching.append(c.get('chapter_num', '?'))
|
||||||
|
counts[pattern] = {'count': len(matching), 'chapters': matching}
|
||||||
|
return dict(sorted(counts.items(), key=lambda x: x[1]['count'], reverse=True))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# HTML builder
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _score_color(s):
|
||||||
|
try:
|
||||||
|
s = float(s)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return '#6c757d'
|
||||||
|
if s >= 8: return '#28a745'
|
||||||
|
if s >= 7: return '#20c997'
|
||||||
|
if s >= 6: return '#ffc107'
|
||||||
|
return '#dc3545'
|
||||||
|
|
||||||
|
|
||||||
|
def _decision_badge(d):
|
||||||
|
MAP = {
|
||||||
|
'auto_accepted': ('⚡ Auto-Accept', '#28a745'),
|
||||||
|
'accepted': ('✓ Accepted', '#17a2b8'),
|
||||||
|
'accepted_at_max': ('✓ Accepted', '#17a2b8'),
|
||||||
|
'below_threshold': ('⚠ Below Threshold', '#dc3545'),
|
||||||
|
'below_threshold_accepted': ('⚠ Below Threshold', '#dc3545'),
|
||||||
|
'full_rewrite': ('🔄 Full Rewrite', '#6f42c1'),
|
||||||
|
'full_rewrite_failed': ('🔄✗ Rewrite Failed','#6f42c1'),
|
||||||
|
'refinement': ('✏ Refined', '#fd7e14'),
|
||||||
|
'refinement_failed': ('✏✗ Refine Failed', '#fd7e14'),
|
||||||
|
'eval_error': ('⚠ Eval Error', '#6c757d'),
|
||||||
|
}
|
||||||
|
label, color = MAP.get(d, (d or '?', '#6c757d'))
|
||||||
|
return f'<span style="background:{color};color:white;padding:2px 8px;border-radius:4px;font-size:0.78em">{label}</span>'
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_int_fmt(v):
|
||||||
|
try:
|
||||||
|
return f"{int(v):,}"
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return str(v) if v else '?'
|
||||||
|
|
||||||
|
|
||||||
|
def _build_html(title, genre, report_date, chapters, avg_score, total,
                auto_accepted, multi_attempt, full_rewrites, below_threshold,
                polish_applied, score_dist, patterns):
    """Render the evaluation report as one self-contained HTML document.

    Args:
        title, genre, report_date: Header text (HTML-escaped here).
        chapters: Chapter records as stored in eval_log.json.
        avg_score, total, auto_accepted, multi_attempt, full_rewrites,
        below_threshold, polish_applied: Pre-computed summary-card values.
        score_dist: Mapping score (1..10) -> number of chapters.
        patterns: Output of ``_mine_critique_patterns``.

    Returns:
        The complete HTML page as a string (inline CSS and JS, no external
        resources).

    All text taken from the log or the blueprint is escaped before it is
    interpolated. The prompt-tuning notes are inserted as-is because they
    carry intentional ``<strong>`` markup.
    """
    from html import escape  # function-scope import keeps the block self-contained

    def esc(value):
        """Return *value* as HTML-safe text."""
        return escape(str(value))

    def as_float(value):
        """Return *value* as a float, or None when it is not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    # Malformed (non-dict) log entries cannot be rendered; skip them.
    chapters = [c for c in chapters if isinstance(c, dict)]
    title_html = esc(title)
    genre_html = esc(genre)
    avg_color = _score_color(avg_score)

    # --- Score timeline ---
    MAX_BAR = 260
    timeline_parts = []
    for c in chapters:
        s = c.get('final_score', 0)
        color = _score_color(s)
        score_val = as_float(s)
        try:
            # Unscored / zero chapters still get a 2px stub so the row is visible.
            width = max(2, int(score_val / 10 * MAX_BAR)) if score_val else 2
        except (ValueError, OverflowError):
            width = 2  # NaN or infinity in the log
        ch_num = esc(c.get('chapter_num', '?'))
        # Truncate first, then escape, so an entity is never cut in half.
        ch_title = esc(str(c.get('title', ''))[:35])
        timeline_parts.append(
            f'<div style="display:flex;align-items:center;margin-bottom:4px;font-size:0.8em">'
            f'<div style="width:45px;text-align:right;margin-right:8px;color:#888;flex-shrink:0">Ch {ch_num}</div>'
            f'<div style="background:{color};height:16px;width:{width}px;border-radius:2px;flex-shrink:0"></div>'
            f'<div style="margin-left:8px;color:#555">{esc(s)}/10 — {ch_title}</div>'
            f'</div>'
        )
    timeline_rows = ''.join(timeline_parts)

    # --- Score distribution ---
    max_dist = max(score_dist.values()) if any(score_dist.values()) else 1
    dist_parts = []
    for sv in range(10, 0, -1):
        count = score_dist.get(sv, 0)
        w = max(2, int((count / max_dist) * 200)) if count else 0
        color = _score_color(sv)
        dist_parts.append(
            f'<div style="display:flex;align-items:center;margin-bottom:4px;font-size:0.85em">'
            f'<div style="width:28px;text-align:right;margin-right:8px;font-weight:bold;color:{color}">{sv}</div>'
            f'<div style="background:{color};height:15px;width:{w}px;border-radius:2px;opacity:0.85"></div>'
            f'<div style="margin-left:8px;color:#666">{count} ch{"apters" if count != 1 else "apter"}</div>'
            f'</div>'
        )
    dist_rows = ''.join(dist_parts)

    # --- Chapter rows ---
    chapter_parts = []
    # The row index (not chapter_num) is the DOM id: chapter numbers can
    # repeat in an append-only log and are not guaranteed to be numeric.
    for row_id, c in enumerate(chapters):
        cid = esc(c.get('chapter_num', 0))
        ch_title = esc(c.get('title', ''))
        pov = esc(c.get('pov_character') or '—')
        pace = esc(c.get('pacing') or '—')
        target_w = esc(_safe_int_fmt(c.get('target_words')))
        actual_w = esc(_safe_int_fmt(c.get('actual_words')))
        pos = as_float(c.get('chapter_position'))
        try:
            pos_pct = f"{int(pos * 100)}%" if pos is not None else '—'
        except (ValueError, OverflowError):
            pos_pct = '—'
        threshold = esc(c.get('score_threshold', '?'))
        fw_val = as_float(c.get('filter_word_density', 0))
        fw_dens = fw_val if fw_val is not None else 0.0
        polish = '✓' if c.get('polish_applied') else '✗'
        polish_c = '#28a745' if c.get('polish_applied') else '#aaa'
        fs = c.get('final_score', 0)
        fd = c.get('final_decision', '')
        attempts = c.get('attempts') or []
        n_att = len(attempts)
        fs_color = _score_color(fs)
        fd_badge = _decision_badge(fd)

        # Attempt details are <div> blocks inside ONE cell of the detail row.
        # Nesting <tr> inside <tr> is invalid HTML: parsers hoist the inner
        # rows out as always-visible siblings, which defeats the toggle.
        att_parts = []
        for att in attempts:
            if not isinstance(att, dict):
                continue
            an = esc(att.get('n', '?'))
            ascr = att.get('score', '?')
            adec = att.get('decision', '')
            acrit = esc(att.get('critique', 'No critique.'))
            ac = _score_color(ascr)
            abadge = _decision_badge(adec)
            att_parts.append(
                f'<div style="background:#f6f8fa;padding:12px 16px 12px 56px;border-bottom:1px solid #e8eaed">'
                f'<div style="margin-bottom:6px"><strong>Attempt {an}:</strong>'
                f'<span style="font-size:1.1em;font-weight:bold;color:{ac};margin:0 8px">{esc(ascr)}/10</span>'
                f'{abadge}</div>'
                f'<div style="font-size:0.83em;color:#444;line-height:1.55;white-space:pre-wrap;'
                f'background:#fff;padding:10px 12px;border-left:3px solid {ac};border-radius:2px;'
                f'max-height:300px;overflow-y:auto">{acrit}</div>'
                f'</div>'
            )
        if not att_parts:
            att_parts.append(
                '<div style="background:#f6f8fa;padding:12px 16px 12px 56px;color:#666">'
                'No evaluation attempts recorded.</div>'
            )
        att_rows = ''.join(att_parts)

        chapter_parts.append(
            f'<tr class="chrow" onclick="toggle({row_id})" style="cursor:pointer">'
            f'<td style="font-weight:700;text-align:center">{cid}</td>'
            f'<td>{ch_title}</td>'
            f'<td style="color:#666;font-size:0.85em">{pov}</td>'
            f'<td style="color:#666;font-size:0.85em">{pace}</td>'
            f'<td style="text-align:right">{actual_w} <span style="color:#aaa">/{target_w}</span></td>'
            f'<td style="text-align:center;color:#888">{pos_pct}</td>'
            f'<td style="text-align:center">{threshold}</td>'
            f'<td style="text-align:center;color:{polish_c}">{polish} <span style="color:#aaa;font-size:0.8em">{fw_dens:.3f}</span></td>'
            f'<td style="text-align:center;font-weight:700;font-size:1.1em;color:{fs_color}">{esc(fs)}</td>'
            f'<td style="text-align:center;color:#888">{n_att}×</td>'
            f'<td>{fd_badge}</td>'
            f'</tr>'
            f'<tr id="d{row_id}" class="detrow"><td colspan="11" style="padding:0">{att_rows}</td></tr>'
        )
    chapter_rows = ''.join(chapter_parts)

    # --- Critique patterns ---
    pat_parts = []
    for pattern, data in patterns.items():
        count = data['count']
        if count == 0:
            continue
        pct = int(count / total * 100) if total else 0
        sev_color = '#dc3545' if pct >= 50 else '#fd7e14' if pct >= 30 else '#17a2b8'
        chlist = ', '.join(f'Ch {esc(x)}' for x in data['chapters'][:10])
        if len(data['chapters']) > 10:
            chlist += f' (+{len(data["chapters"]) - 10} more)'
        pat_parts.append(
            f'<tr>'
            f'<td><strong>{esc(pattern)}</strong></td>'
            f'<td style="text-align:center;color:{sev_color};font-weight:700">{count}/{total} ({pct}%)</td>'
            f'<td style="color:#666;font-size:0.83em">{chlist}</td>'
            f'</tr>'
        )
    pat_rows = ''.join(pat_parts)
    if not pat_rows:
        pat_rows = '<tr><td colspan="3" style="color:#666;text-align:center;padding:12px">No significant patterns detected.</td></tr>'

    # --- Prompt tuning notes ---
    notes = _generate_prompt_notes(chapters, avg_score, total, full_rewrites, below_threshold, patterns)
    # Notes contain deliberate <strong> markup, so they are not escaped here.
    notes_html = ''.join(f'<li style="margin-bottom:8px;line-height:1.55">{n}</li>' for n in notes)

    return f'''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Eval Report — {title_html}</title>
<style>
*{{box-sizing:border-box;margin:0;padding:0}}
body{{font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,sans-serif;background:#f0f2f5;color:#333;padding:20px}}
.wrap{{max-width:1280px;margin:0 auto}}
header{{background:#1a1d23;color:#fff;padding:22px 28px;border-radius:10px;margin-bottom:22px}}
header h1{{font-size:0.9em;color:#8b92a1;margin-bottom:4px;font-weight:500}}
header h2{{font-size:1.9em;font-weight:700;margin-bottom:6px}}
header p{{color:#8b92a1;font-size:0.88em}}
.cards{{display:grid;grid-template-columns:repeat(auto-fit,minmax(130px,1fr));gap:12px;margin-bottom:20px}}
.card{{background:#fff;border-radius:8px;padding:16px;text-align:center;box-shadow:0 1px 3px rgba(0,0,0,.08)}}
.card .val{{font-size:2em;font-weight:700}}
.card .lbl{{font-size:0.75em;color:#888;margin-top:4px;line-height:1.3}}
.two-col{{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-bottom:16px}}
section{{background:#fff;border-radius:8px;padding:20px;margin-bottom:16px;box-shadow:0 1px 3px rgba(0,0,0,.08)}}
section h3{{font-size:1em;font-weight:700;border-bottom:2px solid #f0f0f0;padding-bottom:8px;margin-bottom:14px}}
table{{width:100%;border-collapse:collapse;font-size:0.86em}}
th{{background:#f7f8fa;padding:8px 10px;text-align:left;font-weight:600;color:#555;border-bottom:2px solid #e0e4ea;white-space:nowrap}}
td{{padding:8px 10px;border-bottom:1px solid #f0f0f0;vertical-align:middle}}
.chrow:hover{{background:#f7f8fa}}
.detrow{{display:none}}
.legend{{display:flex;gap:14px;flex-wrap:wrap;font-size:0.78em;color:#777;margin-bottom:10px}}
.dot{{display:inline-block;width:11px;height:11px;border-radius:50%;vertical-align:middle;margin-right:3px}}
ul.notes{{padding-left:20px}}
@media(max-width:768px){{.two-col{{grid-template-columns:1fr}}}}
</style>
</head>
<body>
<div class="wrap">

<header>
<h1>BookApp — Evaluation Report</h1>
<h2>{title_html}</h2>
<p>Genre: {genre_html} | Generated: {esc(report_date)} | {total} chapter{"s" if total != 1 else ""}</p>
</header>

<div class="cards">
<div class="card"><div class="val" style="color:{avg_color}">{avg_score}</div><div class="lbl">Avg Score /10</div></div>
<div class="card"><div class="val" style="color:#28a745">{auto_accepted}</div><div class="lbl">Auto-Accepted (8+)</div></div>
<div class="card"><div class="val" style="color:#17a2b8">{multi_attempt}</div><div class="lbl">Multi-Attempt</div></div>
<div class="card"><div class="val" style="color:#6f42c1">{full_rewrites}</div><div class="lbl">Full Rewrites</div></div>
<div class="card"><div class="val" style="color:#dc3545">{below_threshold}</div><div class="lbl">Below Threshold</div></div>
<div class="card"><div class="val" style="color:#fd7e14">{polish_applied}</div><div class="lbl">Polish Passes</div></div>
</div>

<div class="two-col">
<section>
<h3>📊 Score Timeline</h3>
<div class="legend">
<span><span class="dot" style="background:#28a745"></span>8–10 Great</span>
<span><span class="dot" style="background:#20c997"></span>7–7.9 Good</span>
<span><span class="dot" style="background:#ffc107"></span>6–6.9 Passable</span>
<span><span class="dot" style="background:#dc3545"></span>&lt;6 Fail</span>
</div>
<div style="overflow-y:auto;max-height:420px;padding-right:4px">{timeline_rows}</div>
</section>
<section>
<h3>📈 Score Distribution</h3>
<div style="margin-top:8px">{dist_rows}</div>
</section>
</div>

<section>
<h3>📋 Chapter Breakdown <small style="font-weight:400;color:#888">(click any row to expand critiques)</small></h3>
<div style="overflow-x:auto">
<table>
<thead><tr>
<th>#</th><th>Title</th><th>POV</th><th>Pacing</th>
<th style="text-align:right">Words</th>
<th style="text-align:center">Pos%</th>
<th style="text-align:center">Threshold</th>
<th style="text-align:center">Polish / FW</th>
<th style="text-align:center">Score</th>
<th style="text-align:center">Att.</th>
<th>Decision</th>
</tr></thead>
<tbody>{chapter_rows}</tbody>
</table>
</div>
</section>

<section>
<h3>🔍 Critique Patterns <small style="font-weight:400;color:#888">Keyword frequency across all evaluation critiques — high % = prompt gap</small></h3>
<table>
<thead><tr><th>Issue Pattern</th><th style="text-align:center">Frequency</th><th>Affected Chapters</th></tr></thead>
<tbody>{pat_rows}</tbody>
</table>
</section>

<section>
<h3>💡 Prompt Tuning Observations</h3>
<ul class="notes">{notes_html}</ul>
</section>

</div>
<script>
function toggle(id){{
var r=document.getElementById('d'+id);
if(r) r.style.display=(r.style.display==='none'||r.style.display==='')?'table-row':'none';
}}
document.querySelectorAll('.detrow').forEach(function(r){{r.style.display='none';}});
</script>
</body>
</html>'''
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Auto-observations for prompt tuning
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _generate_prompt_notes(chapters, avg_score, total, full_rewrites, below_threshold, patterns):
|
||||||
|
notes = []
|
||||||
|
|
||||||
|
# Overall score
|
||||||
|
if avg_score >= 8:
|
||||||
|
notes.append(f"✅ <strong>High average score ({avg_score}/10).</strong> The generation pipeline is performing well. Focus on the few outlier chapters below the threshold.")
|
||||||
|
elif avg_score >= 7:
|
||||||
|
notes.append(f"✓ <strong>Solid average score ({avg_score}/10).</strong> Minor prompt reinforcement should push this above 8. Focus on the most common critique pattern.")
|
||||||
|
elif avg_score >= 6:
|
||||||
|
notes.append(f"⚠ <strong>Average score of {avg_score}/10 is below target.</strong> Strengthen the draft prompt's Deep POV mandate and filter-word removal rules.")
|
||||||
|
else:
|
||||||
|
notes.append(f"🚨 <strong>Low average score ({avg_score}/10).</strong> The core writing prompt needs significant work — review the Deep POV mandate, genre mandates, and consider adding concrete negative examples.")
|
||||||
|
|
||||||
|
# Full rewrite rate
|
||||||
|
if total > 0:
|
||||||
|
rw_pct = int(full_rewrites / total * 100)
|
||||||
|
if rw_pct > 30:
|
||||||
|
notes.append(f"🔄 <strong>High full-rewrite rate ({rw_pct}%, {full_rewrites} triggers).</strong> The initial draft prompt produces too many sub-6 drafts. Add stronger examples or tighten the DEEP_POV_MANDATE and PROSE_RULES sections.")
|
||||||
|
elif rw_pct > 15:
|
||||||
|
notes.append(f"↩ <strong>Moderate full-rewrite rate ({rw_pct}%, {full_rewrites} triggers).</strong> The draft quality could be improved. Check the genre mandates for the types of chapters that rewrite most often.")
|
||||||
|
|
||||||
|
# Below threshold
|
||||||
|
if below_threshold > 0:
|
||||||
|
bt_pct = int(below_threshold / total * 100)
|
||||||
|
notes.append(f"⚠ <strong>{below_threshold} chapter{'s' if below_threshold != 1 else ''} ({bt_pct}%) finished below the quality threshold.</strong> Inspect the individual critiques to see if these cluster by POV, pacing, or story position.")
|
||||||
|
|
||||||
|
# Top critique patterns
|
||||||
|
for pattern, data in list(patterns.items())[:5]:
|
||||||
|
pct = int(data['count'] / total * 100) if total else 0
|
||||||
|
if pct >= 50:
|
||||||
|
notes.append(f"🔴 <strong>'{pattern}' appears in {pct}% of critiques.</strong> This is systemic — the current prompt does not prevent it. Add an explicit enforcement instruction with a concrete example of the wrong pattern and the correct alternative.")
|
||||||
|
elif pct >= 30:
|
||||||
|
notes.append(f"🟡 <strong>'{pattern}' mentioned in {pct}% of critiques.</strong> Consider reinforcing the relevant prompt instruction with a stronger negative example.")
|
||||||
|
|
||||||
|
# Climax vs. early chapter comparison
|
||||||
|
high_scores = [c.get('final_score', 0) for c in chapters if isinstance(c.get('chapter_position'), float) and c['chapter_position'] >= 0.75]
|
||||||
|
low_scores = [c.get('final_score', 0) for c in chapters if isinstance(c.get('chapter_position'), float) and c['chapter_position'] < 0.25]
|
||||||
|
if high_scores and low_scores:
|
||||||
|
avg_climax = round(sum(high_scores) / len(high_scores), 1)
|
||||||
|
avg_early = round(sum(low_scores) / len(low_scores), 1)
|
||||||
|
if avg_climax < avg_early - 0.5:
|
||||||
|
notes.append(f"📅 <strong>Climax chapters average {avg_climax}/10 vs early chapters {avg_early}/10.</strong> The high-stakes scenes underperform. Strengthen the genre mandates for climax pacing and consider adding specific instructions for emotional payoff.")
|
||||||
|
elif avg_climax > avg_early + 0.5:
|
||||||
|
notes.append(f"📅 <strong>Climax chapters outperform early chapters ({avg_climax} vs {avg_early}).</strong> Good — the adaptive threshold and extra attempts are concentrating quality where it matters.")
|
||||||
|
|
||||||
|
# POV character analysis
|
||||||
|
pov_scores = {}
|
||||||
|
for c in chapters:
|
||||||
|
pov = c.get('pov_character') or 'Unknown'
|
||||||
|
s = c.get('final_score', 0)
|
||||||
|
if s > 0:
|
||||||
|
pov_scores.setdefault(pov, []).append(s)
|
||||||
|
for pov, sc in sorted(pov_scores.items(), key=lambda x: sum(x[1]) / len(x[1])):
|
||||||
|
if len(sc) >= 2 and sum(sc) / len(sc) < 6.5:
|
||||||
|
avg_pov = round(sum(sc) / len(sc), 1)
|
||||||
|
notes.append(f"👤 <strong>POV '{pov}' averages {avg_pov}/10.</strong> Consider adding or strengthening a character voice profile for this character, or refining the persona bio to match how this POV character should speak and think.")
|
||||||
|
|
||||||
|
# Pacing analysis
|
||||||
|
pace_scores = {}
|
||||||
|
for c in chapters:
|
||||||
|
pace = c.get('pacing', 'Standard')
|
||||||
|
s = c.get('final_score', 0)
|
||||||
|
if s > 0:
|
||||||
|
pace_scores.setdefault(pace, []).append(s)
|
||||||
|
for pace, sc in pace_scores.items():
|
||||||
|
if len(sc) >= 3 and sum(sc) / len(sc) < 6.5:
|
||||||
|
avg_p = round(sum(sc) / len(sc), 1)
|
||||||
|
notes.append(f"⏩ <strong>'{pace}' pacing chapters average {avg_p}/10.</strong> The writing model struggles with this rhythm. Revisit the PACING_GUIDE instructions for '{pace}' chapters — they may need more concrete direction.")
|
||||||
|
|
||||||
|
if not notes:
|
||||||
|
notes.append("No significant patterns detected. Review the individual chapter critiques for targeted improvements.")
|
||||||
|
return notes
|
||||||
@@ -1,9 +1,11 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import time
|
||||||
from core import config, utils
|
from core import config, utils
|
||||||
from ai import models as ai_models
|
from ai import models as ai_models
|
||||||
from story.style_persona import get_style_guidelines
|
from story.style_persona import get_style_guidelines
|
||||||
from story.editor import evaluate_chapter_quality
|
from story.editor import evaluate_chapter_quality
|
||||||
|
from story import eval_logger
|
||||||
|
|
||||||
|
|
||||||
def get_genre_instructions(genre):
|
def get_genre_instructions(genre):
|
||||||
@@ -443,6 +445,25 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
|
|||||||
SCORE_PASSING = 7
|
SCORE_PASSING = 7
|
||||||
SCORE_REWRITE_THRESHOLD = 6
|
SCORE_REWRITE_THRESHOLD = 6
|
||||||
|
|
||||||
|
# Evaluation log entry — written to eval_log.json for the HTML report.
|
||||||
|
_eval_entry = {
|
||||||
|
"ts": time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
"chapter_num": chap['chapter_number'],
|
||||||
|
"title": chap.get('title', ''),
|
||||||
|
"pov_character": chap.get('pov_character', ''),
|
||||||
|
"pacing": pacing,
|
||||||
|
"target_words": est_words,
|
||||||
|
"actual_words": draft_words,
|
||||||
|
"chapter_position": chapter_position,
|
||||||
|
"score_threshold": SCORE_PASSING,
|
||||||
|
"score_auto_accept": SCORE_AUTO_ACCEPT,
|
||||||
|
"polish_applied": bool(current_text and not _skip_polish),
|
||||||
|
"filter_word_density": round(_fw_density, 4),
|
||||||
|
"attempts": [],
|
||||||
|
"final_score": 0,
|
||||||
|
"final_decision": "unknown",
|
||||||
|
}
|
||||||
|
|
||||||
best_score = 0
|
best_score = 0
|
||||||
best_text = current_text
|
best_text = current_text
|
||||||
past_critiques = []
|
past_critiques = []
|
||||||
@@ -452,16 +473,27 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
|
|||||||
score, critique = evaluate_chapter_quality(current_text, chap['title'], meta.get('genre', 'Fiction'), ai_models.model_logic, folder, series_context=series_block.strip())
|
score, critique = evaluate_chapter_quality(current_text, chap['title'], meta.get('genre', 'Fiction'), ai_models.model_logic, folder, series_context=series_block.strip())
|
||||||
|
|
||||||
past_critiques.append(f"Attempt {attempt}: {critique}")
|
past_critiques.append(f"Attempt {attempt}: {critique}")
|
||||||
|
_att = {"n": attempt, "score": score, "critique": critique[:700], "decision": None}
|
||||||
|
|
||||||
if "Evaluation error" in critique:
|
if "Evaluation error" in critique:
|
||||||
utils.log("WRITER", f" ⚠️ {critique}. Keeping current draft.")
|
utils.log("WRITER", f" ⚠️ {critique}. Keeping current draft.")
|
||||||
if best_score == 0: best_text = current_text
|
if best_score == 0: best_text = current_text
|
||||||
|
_att["decision"] = "eval_error"
|
||||||
|
_eval_entry["attempts"].append(_att)
|
||||||
|
_eval_entry["final_score"] = best_score
|
||||||
|
_eval_entry["final_decision"] = "eval_error"
|
||||||
|
eval_logger.append_eval_entry(folder, _eval_entry)
|
||||||
break
|
break
|
||||||
|
|
||||||
utils.log("WRITER", f" Score: {score}/10. Critique: {critique}")
|
utils.log("WRITER", f" Score: {score}/10. Critique: {critique}")
|
||||||
|
|
||||||
if score >= SCORE_AUTO_ACCEPT:
|
if score >= SCORE_AUTO_ACCEPT:
|
||||||
utils.log("WRITER", " 🌟 Auto-Accept threshold met.")
|
utils.log("WRITER", " 🌟 Auto-Accept threshold met.")
|
||||||
|
_att["decision"] = "auto_accepted"
|
||||||
|
_eval_entry["attempts"].append(_att)
|
||||||
|
_eval_entry["final_score"] = score
|
||||||
|
_eval_entry["final_decision"] = "auto_accepted"
|
||||||
|
eval_logger.append_eval_entry(folder, _eval_entry)
|
||||||
return current_text
|
return current_text
|
||||||
|
|
||||||
if score > best_score:
|
if score > best_score:
|
||||||
@@ -471,9 +503,19 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
|
|||||||
if attempt == max_attempts:
|
if attempt == max_attempts:
|
||||||
if best_score >= SCORE_PASSING:
|
if best_score >= SCORE_PASSING:
|
||||||
utils.log("WRITER", f" ✅ Max attempts reached. Accepting best score ({best_score}).")
|
utils.log("WRITER", f" ✅ Max attempts reached. Accepting best score ({best_score}).")
|
||||||
|
_att["decision"] = "accepted"
|
||||||
|
_eval_entry["attempts"].append(_att)
|
||||||
|
_eval_entry["final_score"] = best_score
|
||||||
|
_eval_entry["final_decision"] = "accepted"
|
||||||
|
eval_logger.append_eval_entry(folder, _eval_entry)
|
||||||
return best_text
|
return best_text
|
||||||
else:
|
else:
|
||||||
utils.log("WRITER", f" ⚠️ Quality low ({best_score}/{SCORE_PASSING}) but max attempts reached. Proceeding.")
|
utils.log("WRITER", f" ⚠️ Quality low ({best_score}/{SCORE_PASSING}) but max attempts reached. Proceeding.")
|
||||||
|
_att["decision"] = "below_threshold"
|
||||||
|
_eval_entry["attempts"].append(_att)
|
||||||
|
_eval_entry["final_score"] = best_score
|
||||||
|
_eval_entry["final_decision"] = "below_threshold"
|
||||||
|
eval_logger.append_eval_entry(folder, _eval_entry)
|
||||||
return best_text
|
return best_text
|
||||||
|
|
||||||
if score < SCORE_REWRITE_THRESHOLD:
|
if score < SCORE_REWRITE_THRESHOLD:
|
||||||
@@ -495,10 +537,17 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
|
|||||||
utils.log_usage(folder, ai_models.model_logic.name, resp_rewrite.usage_metadata)
|
utils.log_usage(folder, ai_models.model_logic.name, resp_rewrite.usage_metadata)
|
||||||
current_text = resp_rewrite.text
|
current_text = resp_rewrite.text
|
||||||
ai_models.model_logic.update(ai_models.logic_model_name)
|
ai_models.model_logic.update(ai_models.logic_model_name)
|
||||||
|
_att["decision"] = "full_rewrite"
|
||||||
|
_eval_entry["attempts"].append(_att)
|
||||||
continue
|
continue
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
ai_models.model_logic.update(ai_models.logic_model_name)
|
ai_models.model_logic.update(ai_models.logic_model_name)
|
||||||
utils.log("WRITER", f"Full rewrite failed: {e}. Falling back to refinement.")
|
utils.log("WRITER", f"Full rewrite failed: {e}. Falling back to refinement.")
|
||||||
|
_att["decision"] = "full_rewrite_failed"
|
||||||
|
# fall through to refinement; decision will be overwritten below
|
||||||
|
|
||||||
|
else:
|
||||||
|
_att["decision"] = "refinement"
|
||||||
|
|
||||||
utils.log("WRITER", f" -> Refining Ch {chap['chapter_number']} based on feedback...")
|
utils.log("WRITER", f" -> Refining Ch {chap['chapter_number']} based on feedback...")
|
||||||
|
|
||||||
@@ -553,8 +602,21 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
|
|||||||
resp_refine = ai_models.model_writer.generate_content(refine_prompt)
|
resp_refine = ai_models.model_writer.generate_content(refine_prompt)
|
||||||
utils.log_usage(folder, ai_models.model_writer.name, resp_refine.usage_metadata)
|
utils.log_usage(folder, ai_models.model_writer.name, resp_refine.usage_metadata)
|
||||||
current_text = resp_refine.text
|
current_text = resp_refine.text
|
||||||
|
if _att["decision"] == "full_rewrite_failed":
|
||||||
|
_att["decision"] = "refinement" # rewrite failed, fell back to refinement
|
||||||
|
_eval_entry["attempts"].append(_att)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.log("WRITER", f"Refinement failed: {e}")
|
utils.log("WRITER", f"Refinement failed: {e}")
|
||||||
|
_att["decision"] = "refinement_failed"
|
||||||
|
_eval_entry["attempts"].append(_att)
|
||||||
|
_eval_entry["final_score"] = best_score
|
||||||
|
_eval_entry["final_decision"] = "refinement_failed"
|
||||||
|
eval_logger.append_eval_entry(folder, _eval_entry)
|
||||||
return best_text
|
return best_text
|
||||||
|
|
||||||
|
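# Reached when the loop exits without returning (e.g. the eval_error break, which has already logged its entry).
# Log here only if final_decision is still "unknown", i.e. no earlier branch recorded an outcome.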
|
||||||
|
if _eval_entry["final_decision"] == "unknown":
|
||||||
|
_eval_entry["final_score"] = best_score
|
||||||
|
_eval_entry["final_decision"] = "best_available"
|
||||||
|
eval_logger.append_eval_entry(folder, _eval_entry)
|
||||||
return best_text
|
return best_text
|
||||||
|
|||||||
@@ -208,6 +208,9 @@
|
|||||||
<a href="{{ url_for('run.check_consistency', run_id=run.id, book_folder=book.folder) }}" class="btn btn-outline-warning ms-2">
|
<a href="{{ url_for('run.check_consistency', run_id=run.id, book_folder=book.folder) }}" class="btn btn-outline-warning ms-2">
|
||||||
<i class="fas fa-search me-2"></i>Check Consistency
|
<i class="fas fa-search me-2"></i>Check Consistency
|
||||||
</a>
|
</a>
|
||||||
|
<a href="{{ url_for('run.eval_report', run_id=run.id, book_folder=book.folder) }}" class="btn btn-outline-info ms-2" title="Download evaluation report (scores, critiques, prompt tuning notes)">
|
||||||
|
<i class="fas fa-chart-bar me-2"></i>Eval Report
|
||||||
|
</a>
|
||||||
<button class="btn btn-warning ms-2" data-bs-toggle="modal" data-bs-target="#reviseBookModal{{ loop.index }}" title="Regenerate this book with changes, keeping others.">
|
<button class="btn btn-warning ms-2" data-bs-toggle="modal" data-bs-target="#reviseBookModal{{ loop.index }}" title="Regenerate this book with changes, keeping others.">
|
||||||
<i class="fas fa-pencil-alt me-2"></i>Revise
|
<i class="fas fa-pencil-alt me-2"></i>Revise
|
||||||
</button>
|
</button>
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from core import utils
|
|||||||
from ai import models as ai_models
|
from ai import models as ai_models
|
||||||
from ai import setup as ai_setup
|
from ai import setup as ai_setup
|
||||||
from story import editor as story_editor
|
from story import editor as story_editor
|
||||||
from story import bible_tracker, style_persona
|
from story import bible_tracker, style_persona, eval_logger as story_eval_logger
|
||||||
from export import exporter
|
from export import exporter
|
||||||
from web.tasks import huey, regenerate_artifacts_task, rewrite_chapter_task
|
from web.tasks import huey, regenerate_artifacts_task, rewrite_chapter_task
|
||||||
|
|
||||||
@@ -434,6 +434,45 @@ def delete_run(id):
|
|||||||
return redirect(url_for('project.view_project', id=project_id))
|
return redirect(url_for('project.view_project', id=project_id))
|
||||||
|
|
||||||
|
|
||||||
|
@run_bp.route('/project/<int:run_id>/eval_report/<string:book_folder>')
@login_required
def eval_report(run_id, book_folder):
    """Generate and download the self-contained HTML evaluation report.

    Args:
        run_id: Primary key of the Run whose book is being reported on.
        book_folder: Name of the book directory inside the run directory.
            It comes straight from the URL, so it is treated as untrusted.

    Returns:
        * ``("Unauthorized", 403)`` when the run's project belongs to a
          different user than ``current_user``.
        * ``("Invalid book folder", 400)`` when ``book_folder`` is empty,
          is ``"."``, or contains a path separator or ``..``.
        * A small HTML "no evaluation data yet" page with status 200 when
          ``generate_html_report`` returns nothing.
        * Otherwise a ``Response`` carrying the report HTML as an attachment
          named ``eval_report_<title>.html``.
    """
    from flask import Response

    run = db.session.get(Run, run_id) or Run.query.get_or_404(run_id)
    if run.project.user_id != current_user.id:
        return "Unauthorized", 403

    # Reject anything that could escape (or alias) the run directory. "." is
    # refused as well because it would resolve to the run directory itself
    # rather than to a book folder inside it.
    if (
        not book_folder
        or book_folder == "."
        or "/" in book_folder
        or "\\" in book_folder
        or ".." in book_folder
    ):
        return "Invalid book folder", 400

    run_dir = os.path.join(run.project.folder_path, "runs", f"run_{run.id}")
    book_path = os.path.join(run_dir, book_folder)

    # Prefer the final blueprint; fall back to the initial one when the final
    # blueprint is missing or loads as a falsy value.
    bp = utils.load_json(os.path.join(book_path, "final_blueprint.json")) or \
        utils.load_json(os.path.join(book_path, "blueprint_initial.json"))

    html = story_eval_logger.generate_html_report(book_path, bp)
    if not html:
        return (
            "<html><body style='font-family:sans-serif;padding:40px'>"
            "<h2>No evaluation data yet.</h2>"
            "<p>The evaluation report is generated during the writing phase. "
            "Start a generation run and the report will be available once chapters have been evaluated.</p>"
            "</body></html>"
        ), 200

    # Null-safe title lookup: a blueprint that is not a dict, or one whose
    # "book_metadata" is present but null, must not break the download.
    metadata = bp.get('book_metadata') if isinstance(bp, dict) else None
    title = metadata.get('title') if isinstance(metadata, dict) else None
    safe_title = utils.sanitize_filename(title or book_folder)[:40]
    filename = f"eval_report_{safe_title}.html"

    return Response(
        html,
        mimetype='text/html',
        headers={'Content-Disposition': f'attachment; filename="{filename}"'}
    )
|
||||||
|
|
||||||
|
|
||||||
@run_bp.route('/run/<int:id>/download_bible')
|
@run_bp.route('/run/<int:id>/download_bible')
|
||||||
@login_required
|
@login_required
|
||||||
def download_bible(id):
|
def download_bible(id):
|
||||||
|
|||||||
Reference in New Issue
Block a user