feat: Add evaluation report pipeline for prompt tuning feedback

Adds a full per-chapter evaluation logging system that captures every
score, critique, and quality decision made during writing, then renders
a self-contained HTML report shareable with critics or prompt engineers.

New file — story/eval_logger.py:
- append_eval_entry(folder, entry): writes per-chapter eval data to
  eval_log.json in the book folder (called from write_chapter() at
  every return point; a sketch follows this list).
- generate_html_report(folder, bp): reads eval_log.json and produces a
  self-contained HTML file (no external deps) with:
    • Summary cards (avg score, auto-accepted, rewrites, below-threshold)
    • Score timeline bar chart (one bar per chapter, colour-coded)
    • Score distribution histogram
    • Chapter breakdown table with expand-on-click critique details
      (attempt number, score, decision badge, full critique text)
    • Critique pattern frequency table (keyword mining across all critiques)
    • Auto-generated prompt tuning observations (systemic issues, POV
      character weak spots, pacing type analysis, climax vs. early
      chapter comparison)
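
A minimal sketch of the append_eval_entry read/modify/write cycle, for
reference (assumed shape only; the real implementation in
story/eval_logger.py may differ in error handling and formatting):

    import json
    import os

    def append_eval_entry(folder, entry):
        """Append one chapter's eval record to <folder>/eval_log.json."""
        path = os.path.join(folder, "eval_log.json")
        entries = []
        if os.path.exists(path):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    entries = json.load(f)
            except (json.JSONDecodeError, OSError):
                entries = []  # unreadable log: start over rather than crash the writer
        entries.append(entry)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(entries, f, indent=2, ensure_ascii=False)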

story/writer.py:
- Imports time and eval_logger.
- Initialises _eval_entry dict (chapter metadata + polish flags + thresholds)
  after all threshold variables are set.
- Records each evaluation attempt's score, critique (truncated to 700 chars),
  and decision (auto_accepted / full_rewrite / refinement / accepted /
  below_threshold / eval_error / refinement_failed) before every return.

web/routes/run.py:
- Imports story_eval_logger.
- New route GET /project/<run_id>/eval_report/<book_folder>: loads
  eval_log.json, calls generate_html_report(), returns the HTML as a
  downloadable attachment named eval_report_<title>.html.
  Returns a user-friendly "not yet available" page if no log exists
  (see the route sketch below).
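
A sketch of the route, assuming Flask (the blueprint name, path layout,
and bp-argument handling below are placeholders, not the real code):

    import os
    from flask import Blueprint, Response
    from story import eval_logger as story_eval_logger

    run_bp = Blueprint("run", __name__)  # placeholder blueprint

    @run_bp.route("/project/<run_id>/eval_report/<book_folder>")
    def eval_report(run_id, book_folder):
        folder = os.path.join("runs", run_id, book_folder)  # placeholder layout
        if not os.path.exists(os.path.join(folder, "eval_log.json")):
            return "<h1>Eval report not yet available</h1>"  # friendly fallback
        book_bp = {}  # placeholder: the real route loads the book blueprint here
        html = story_eval_logger.generate_html_report(folder, book_bp)
        return Response(
            html,
            mimetype="text/html",
            headers={"Content-Disposition":
                     f"attachment; filename=eval_report_{book_folder}.html"},
        )  # the real code names the file after the book title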

templates/run_details.html:
- Adds "Eval Report" (btn-outline-info) button next to "Check Consistency"
  in each book's artifact section.
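
The markup is likely a one-liner beside the existing button; the url_for
endpoint and context variable names here are guesses:

    <a class="btn btn-sm btn-outline-info"
       href="{{ url_for('run.eval_report', run_id=run.id, book_folder=book.folder) }}">
      Eval Report
    </a>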

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-24 08:03:32 -05:00
parent d2c65f010a
commit f869700070
4 changed files with 578 additions and 1 deletion

story/writer.py:

@@ -1,9 +1,11 @@
import json
import os
import time
from core import config, utils
from ai import models as ai_models
from story.style_persona import get_style_guidelines
from story.editor import evaluate_chapter_quality
from story import eval_logger


def get_genre_instructions(genre):
@@ -443,6 +445,25 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
    SCORE_PASSING = 7
    SCORE_REWRITE_THRESHOLD = 6

    # Evaluation log entry — written to eval_log.json for the HTML report.
    _eval_entry = {
        "ts": time.strftime('%Y-%m-%d %H:%M:%S'),
        "chapter_num": chap['chapter_number'],
        "title": chap.get('title', ''),
        "pov_character": chap.get('pov_character', ''),
        "pacing": pacing,
        "target_words": est_words,
        "actual_words": draft_words,
        "chapter_position": chapter_position,
        "score_threshold": SCORE_PASSING,
        "score_auto_accept": SCORE_AUTO_ACCEPT,
        "polish_applied": bool(current_text and not _skip_polish),
        "filter_word_density": round(_fw_density, 4),
        "attempts": [],
        "final_score": 0,
        "final_decision": "unknown",
    }

    best_score = 0
    best_text = current_text
    past_critiques = []
@@ -452,16 +473,27 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
        score, critique = evaluate_chapter_quality(current_text, chap['title'], meta.get('genre', 'Fiction'), ai_models.model_logic, folder, series_context=series_block.strip())
        past_critiques.append(f"Attempt {attempt}: {critique}")
        _att = {"n": attempt, "score": score, "critique": critique[:700], "decision": None}

        if "Evaluation error" in critique:
            utils.log("WRITER", f" ⚠️ {critique}. Keeping current draft.")
            if best_score == 0: best_text = current_text
            _att["decision"] = "eval_error"
            _eval_entry["attempts"].append(_att)
            _eval_entry["final_score"] = best_score
            _eval_entry["final_decision"] = "eval_error"
            eval_logger.append_eval_entry(folder, _eval_entry)
            break

        utils.log("WRITER", f" Score: {score}/10. Critique: {critique}")

        if score >= SCORE_AUTO_ACCEPT:
            utils.log("WRITER", " 🌟 Auto-Accept threshold met.")
            _att["decision"] = "auto_accepted"
            _eval_entry["attempts"].append(_att)
            _eval_entry["final_score"] = score
            _eval_entry["final_decision"] = "auto_accepted"
            eval_logger.append_eval_entry(folder, _eval_entry)
            return current_text

        if score > best_score:
@@ -471,9 +503,19 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
        if attempt == max_attempts:
            if best_score >= SCORE_PASSING:
                utils.log("WRITER", f" ✅ Max attempts reached. Accepting best score ({best_score}).")
                _att["decision"] = "accepted"
                _eval_entry["attempts"].append(_att)
                _eval_entry["final_score"] = best_score
                _eval_entry["final_decision"] = "accepted"
                eval_logger.append_eval_entry(folder, _eval_entry)
                return best_text
            else:
                utils.log("WRITER", f" ⚠️ Quality low ({best_score}/{SCORE_PASSING}) but max attempts reached. Proceeding.")
                _att["decision"] = "below_threshold"
                _eval_entry["attempts"].append(_att)
                _eval_entry["final_score"] = best_score
                _eval_entry["final_decision"] = "below_threshold"
                eval_logger.append_eval_entry(folder, _eval_entry)
                return best_text

        if score < SCORE_REWRITE_THRESHOLD:
@@ -495,10 +537,17 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
                utils.log_usage(folder, ai_models.model_logic.name, resp_rewrite.usage_metadata)
                current_text = resp_rewrite.text
                ai_models.model_logic.update(ai_models.logic_model_name)
                _att["decision"] = "full_rewrite"
                _eval_entry["attempts"].append(_att)
                continue
            except Exception as e:
                ai_models.model_logic.update(ai_models.logic_model_name)
                utils.log("WRITER", f"Full rewrite failed: {e}. Falling back to refinement.")
                _att["decision"] = "full_rewrite_failed"
                # fall through to refinement; decision will be overwritten below
        else:
            _att["decision"] = "refinement"

        utils.log("WRITER", f" -> Refining Ch {chap['chapter_number']} based on feedback...")
@@ -553,8 +602,21 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
            resp_refine = ai_models.model_writer.generate_content(refine_prompt)
            utils.log_usage(folder, ai_models.model_writer.name, resp_refine.usage_metadata)
            current_text = resp_refine.text
            if _att["decision"] == "full_rewrite_failed":
                _att["decision"] = "refinement"  # rewrite failed, fell back to refinement
            _eval_entry["attempts"].append(_att)
        except Exception as e:
            utils.log("WRITER", f"Refinement failed: {e}")
            _att["decision"] = "refinement_failed"
            _eval_entry["attempts"].append(_att)
            _eval_entry["final_score"] = best_score
            _eval_entry["final_decision"] = "refinement_failed"
            eval_logger.append_eval_entry(folder, _eval_entry)
            return best_text

    # Post-loop safety net: the eval_error break above already wrote its
    # entry, so only log here if no final decision was ever recorded.
    if _eval_entry["final_decision"] == "unknown":
        _eval_entry["final_score"] = best_score
        _eval_entry["final_decision"] = "best_available"
        eval_logger.append_eval_entry(folder, _eval_entry)
    return best_text