' for n in notes)
+
+ return f'''
+
+
+
+
+Eval Report — {title}
+
+
+
+
+
+
+
BookApp — Evaluation Report
+
{title}
+
Genre: {genre} | Generated: {report_date} | {total} chapter{"s" if total != 1 else ""}
+
+
+
+
{avg_score}
Avg Score /10
+
{auto_accepted}
Auto-Accepted (8+)
+
{multi_attempt}
Multi-Attempt
+
{full_rewrites}
Full Rewrites
+
{below_threshold}
Below Threshold
+
{polish_applied}
Polish Passes
+
+
+
+
+
📊 Score Timeline
+
+ 8–10 Great
+ 7–7.9 Good
+ 6–6.9 Passable
+ <6 Fail
+
+
{timeline_rows}
+
+
+
📈 Score Distribution
+
{dist_rows}
+
+
+
+
+
📋 Chapter Breakdown (click any row to expand critiques)
+
+
+
+
#
Title
POV
Pacing
+
Words
+
Pos%
+
Threshold
+
Polish / FW
+
Score
+
Att.
+
Decision
+
+ {chapter_rows}
+
+
+
+
+
+
🔍 Critique Patterns Keyword frequency across all evaluation critiques — high % = prompt gap
+
+
Issue Pattern
Frequency
Affected Chapters
+ {pat_rows}
+
+
+
+
+
💡 Prompt Tuning Observations
+
{notes_html}
+
+
+
+
+
+'''
+
+
+# ---------------------------------------------------------------------------
+# Auto-observations for prompt tuning
+# ---------------------------------------------------------------------------
+
+def _generate_prompt_notes(chapters, avg_score, total, full_rewrites, below_threshold, patterns):
+ notes = []
+
+ # Headline note tiered by average score: >=8 healthy, >=7 near target, >=6 below target, else failing.
+ if avg_score >= 8:
+ notes.append(f"✅ High average score ({avg_score}/10). The generation pipeline is performing well. Focus on the few outlier chapters below the threshold.")
+ elif avg_score >= 7:
+ notes.append(f"✓ Solid average score ({avg_score}/10). Minor prompt reinforcement should push this above 8. Focus on the most common critique pattern.")
+ elif avg_score >= 6:
+ notes.append(f"⚠ Average score of {avg_score}/10 is below target. Strengthen the draft prompt's Deep POV mandate and filter-word removal rules.")
+ else:
+ notes.append(f"🚨 Low average score ({avg_score}/10). The core writing prompt needs significant work — review the Deep POV mandate, genre mandates, and consider adding concrete negative examples.")
+
+ # Full-rewrite rate: >30% flags a systemic draft weakness, >15% a moderate one; silent below that.
+ if total > 0:
+ rw_pct = int(full_rewrites / total * 100)
+ if rw_pct > 30:
+ notes.append(f"🔄 High full-rewrite rate ({rw_pct}%, {full_rewrites} triggers). The initial draft prompt produces too many sub-6 drafts. Add stronger examples or tighten the DEEP_POV_MANDATE and PROSE_RULES sections.")
+ elif rw_pct > 15:
+ notes.append(f"↩ Moderate full-rewrite rate ({rw_pct}%, {full_rewrites} triggers). The draft quality could be improved. Check the genre mandates for the types of chapters that rewrite most often.")
+
+ # Below-threshold chapters (below_threshold > 0 implies total > 0, so the division is safe here).
+ if below_threshold > 0:
+ bt_pct = int(below_threshold / total * 100)
+ notes.append(f"⚠ {below_threshold} chapter{'s' if below_threshold != 1 else ''} ({bt_pct}%) finished below the quality threshold. Inspect the individual critiques to see if these cluster by POV, pacing, or story position.")
+
+ # Top 5 critique patterns: >=50% of chapters is treated as systemic, >=30% as worth reinforcing.
+ for pattern, data in list(patterns.items())[:5]:
+ pct = int(data['count'] / total * 100) if total else 0
+ if pct >= 50:
+ notes.append(f"🔴 '{pattern}' appears in {pct}% of critiques. This is systemic — the current prompt does not prevent it. Add an explicit enforcement instruction with a concrete example of the wrong pattern and the correct alternative.")
+ elif pct >= 30:
+ notes.append(f"🟡 '{pattern}' mentioned in {pct}% of critiques. Consider reinforcing the relevant prompt instruction with a stronger negative example.")
+
+ # Late-story (position >= 0.75) vs early (position < 0.25) averages; NOTE(review): the float check skips int positions — presumably positions are always floats, confirm upstream.
+ high_scores = [c.get('final_score', 0) for c in chapters if isinstance(c.get('chapter_position'), float) and c['chapter_position'] >= 0.75]
+ low_scores = [c.get('final_score', 0) for c in chapters if isinstance(c.get('chapter_position'), float) and c['chapter_position'] < 0.25]
+ if high_scores and low_scores:
+ avg_climax = round(sum(high_scores) / len(high_scores), 1)
+ avg_early = round(sum(low_scores) / len(low_scores), 1)
+ if avg_climax < avg_early - 0.5:
+ notes.append(f"📅 Climax chapters average {avg_climax}/10 vs early chapters {avg_early}/10. The high-stakes scenes underperform. Strengthen the genre mandates for climax pacing and consider adding specific instructions for emotional payoff.")
+ elif avg_climax > avg_early + 0.5:
+ notes.append(f"📅 Climax chapters outperform early chapters ({avg_climax} vs {avg_early}). Good — the adaptive threshold and extra attempts are concentrating quality where it matters.")
+
+ # Per-POV averages, weakest first; flag POVs with >=2 scored chapters averaging under 6.5.
+ pov_scores = {}
+ for c in chapters:
+ pov = c.get('pov_character') or 'Unknown'
+ s = c.get('final_score', 0)
+ if s > 0:
+ pov_scores.setdefault(pov, []).append(s)
+ for pov, sc in sorted(pov_scores.items(), key=lambda x: sum(x[1]) / len(x[1])):
+ if len(sc) >= 2 and sum(sc) / len(sc) < 6.5:
+ avg_pov = round(sum(sc) / len(sc), 1)
+ notes.append(f"👤 POV '{pov}' averages {avg_pov}/10. Consider adding or strengthening a character voice profile for this character, or refining the persona bio to match how this POV character should speak and think.")
+
+ # Per-pacing averages; flag pacing styles with >=3 scored chapters averaging under 6.5.
+ pace_scores = {}
+ for c in chapters:
+ pace = c.get('pacing', 'Standard')
+ s = c.get('final_score', 0)
+ if s > 0:
+ pace_scores.setdefault(pace, []).append(s)
+ for pace, sc in pace_scores.items():
+ if len(sc) >= 3 and sum(sc) / len(sc) < 6.5:
+ avg_p = round(sum(sc) / len(sc), 1)
+ notes.append(f"⏩ '{pace}' pacing chapters average {avg_p}/10. The writing model struggles with this rhythm. Revisit the PACING_GUIDE instructions for '{pace}' chapters — they may need more concrete direction.")
+
+ if not notes:
+ notes.append("No significant patterns detected. Review the individual chapter critiques for targeted improvements.")
+ return notes
diff --git a/story/writer.py b/story/writer.py
index 021ba6f..957d05a 100644
--- a/story/writer.py
+++ b/story/writer.py
@@ -1,9 +1,11 @@
import json
import os
+import time
from core import config, utils
from ai import models as ai_models
from story.style_persona import get_style_guidelines
from story.editor import evaluate_chapter_quality
+from story import eval_logger
def get_genre_instructions(genre):
@@ -443,6 +445,25 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
SCORE_PASSING = 7
SCORE_REWRITE_THRESHOLD = 6
+ # Evaluation log entry — written to eval_log.json for the HTML report.
+ _eval_entry = {
+ "ts": time.strftime('%Y-%m-%d %H:%M:%S'),
+ "chapter_num": chap['chapter_number'],
+ "title": chap.get('title', ''),
+ "pov_character": chap.get('pov_character', ''),
+ "pacing": pacing,
+ "target_words": est_words,
+ "actual_words": draft_words,
+ "chapter_position": chapter_position,
+ "score_threshold": SCORE_PASSING,
+ "score_auto_accept": SCORE_AUTO_ACCEPT,
+ "polish_applied": bool(current_text and not _skip_polish),
+ "filter_word_density": round(_fw_density, 4),
+ "attempts": [],
+ "final_score": 0,
+ "final_decision": "unknown",
+ }
+
best_score = 0
best_text = current_text
past_critiques = []
@@ -452,16 +473,27 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
score, critique = evaluate_chapter_quality(current_text, chap['title'], meta.get('genre', 'Fiction'), ai_models.model_logic, folder, series_context=series_block.strip())
past_critiques.append(f"Attempt {attempt}: {critique}")
+ _att = {"n": attempt, "score": score, "critique": critique[:700], "decision": None}
if "Evaluation error" in critique:
utils.log("WRITER", f" ⚠️ {critique}. Keeping current draft.")
if best_score == 0: best_text = current_text
+ _att["decision"] = "eval_error"
+ _eval_entry["attempts"].append(_att)
+ _eval_entry["final_score"] = best_score
+ _eval_entry["final_decision"] = "eval_error"
+ eval_logger.append_eval_entry(folder, _eval_entry)
break
utils.log("WRITER", f" Score: {score}/10. Critique: {critique}")
if score >= SCORE_AUTO_ACCEPT:
utils.log("WRITER", " 🌟 Auto-Accept threshold met.")
+ _att["decision"] = "auto_accepted"
+ _eval_entry["attempts"].append(_att)
+ _eval_entry["final_score"] = score
+ _eval_entry["final_decision"] = "auto_accepted"
+ eval_logger.append_eval_entry(folder, _eval_entry)
return current_text
if score > best_score:
@@ -471,9 +503,19 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
if attempt == max_attempts:
if best_score >= SCORE_PASSING:
utils.log("WRITER", f" ✅ Max attempts reached. Accepting best score ({best_score}).")
+ _att["decision"] = "accepted"
+ _eval_entry["attempts"].append(_att)
+ _eval_entry["final_score"] = best_score
+ _eval_entry["final_decision"] = "accepted"
+ eval_logger.append_eval_entry(folder, _eval_entry)
return best_text
else:
utils.log("WRITER", f" ⚠️ Quality low ({best_score}/{SCORE_PASSING}) but max attempts reached. Proceeding.")
+ _att["decision"] = "below_threshold"
+ _eval_entry["attempts"].append(_att)
+ _eval_entry["final_score"] = best_score
+ _eval_entry["final_decision"] = "below_threshold"
+ eval_logger.append_eval_entry(folder, _eval_entry)
return best_text
if score < SCORE_REWRITE_THRESHOLD:
@@ -495,10 +537,17 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
utils.log_usage(folder, ai_models.model_logic.name, resp_rewrite.usage_metadata)
current_text = resp_rewrite.text
ai_models.model_logic.update(ai_models.logic_model_name)
+ _att["decision"] = "full_rewrite"
+ _eval_entry["attempts"].append(_att)
continue
except Exception as e:
ai_models.model_logic.update(ai_models.logic_model_name)
utils.log("WRITER", f"Full rewrite failed: {e}. Falling back to refinement.")
+ _att["decision"] = "full_rewrite_failed"
+ # fall through to refinement; decision will be overwritten below
+
+ else:
+ _att["decision"] = "refinement"
utils.log("WRITER", f" -> Refining Ch {chap['chapter_number']} based on feedback...")
@@ -553,8 +602,21 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
resp_refine = ai_models.model_writer.generate_content(refine_prompt)
utils.log_usage(folder, ai_models.model_writer.name, resp_refine.usage_metadata)
current_text = resp_refine.text
+ if _att["decision"] == "full_rewrite_failed":
+ _att["decision"] = "refinement" # rewrite failed, fell back to refinement
+ _eval_entry["attempts"].append(_att)
except Exception as e:
utils.log("WRITER", f"Refinement failed: {e}")
+ _att["decision"] = "refinement_failed"
+ _eval_entry["attempts"].append(_att)
+ _eval_entry["final_score"] = best_score
+ _eval_entry["final_decision"] = "refinement_failed"
+ eval_logger.append_eval_entry(folder, _eval_entry)
return best_text
+ # Safety net for any exit path that did not already log; the eval_error branch logs and sets final_decision before breaking, so this guard normally stays idle.
+ if _eval_entry["final_decision"] == "unknown":
+ _eval_entry["final_score"] = best_score
+ _eval_entry["final_decision"] = "best_available"
+ eval_logger.append_eval_entry(folder, _eval_entry)
return best_text
diff --git a/templates/run_details.html b/templates/run_details.html
index 8102ea2..c0400af 100644
--- a/templates/run_details.html
+++ b/templates/run_details.html
@@ -208,6 +208,9 @@
Check Consistency
+
+ Eval Report
+
diff --git a/web/routes/run.py b/web/routes/run.py
index fdd7118..5b8fdda 100644
--- a/web/routes/run.py
+++ b/web/routes/run.py
@@ -10,7 +10,7 @@ from core import utils
from ai import models as ai_models
from ai import setup as ai_setup
from story import editor as story_editor
-from story import bible_tracker, style_persona
+from story import bible_tracker, style_persona, eval_logger as story_eval_logger
from export import exporter
from web.tasks import huey, regenerate_artifacts_task, rewrite_chapter_task
@@ -434,6 +434,45 @@ def delete_run(id):
return redirect(url_for('project.view_project', id=project_id))
+@run_bp.route('/project/<int:run_id>/eval_report/<book_folder>')
+@login_required
+def eval_report(run_id, book_folder):
+ """Generate and download the self-contained HTML evaluation report."""
+ run = db.session.get(Run, run_id) or Run.query.get_or_404(run_id)
+ if run.project.user_id != current_user.id:
+ return "Unauthorized", 403
+
+ if not book_folder or "/" in book_folder or "\\" in book_folder or ".." in book_folder:
+ return "Invalid book folder", 400
+
+ run_dir = os.path.join(run.project.folder_path, "runs", f"run_{run.id}")
+ book_path = os.path.join(run_dir, book_folder)
+
+ bp = utils.load_json(os.path.join(book_path, "final_blueprint.json")) or \
+ utils.load_json(os.path.join(book_path, "blueprint_initial.json"))
+
+ html = story_eval_logger.generate_html_report(book_path, bp)
+ if not html:
+ return (
+ ""
+ "
No evaluation data yet.
"
+ "
The evaluation report is generated during the writing phase. "
+ "Start a generation run and the report will be available once chapters have been evaluated.