feat: Add evaluation report pipeline for prompt tuning feedback
Adds a full per-chapter evaluation logging system that captures every
score, critique, and quality decision made during writing, then renders
a self-contained HTML report shareable with critics or prompt engineers.
New file — story/eval_logger.py:
- append_eval_entry(folder, entry): writes per-chapter eval data to
eval_log.json in the book folder (called from write_chapter() at
every return point).
- generate_html_report(folder, bp): reads eval_log.json and produces a
self-contained HTML file (no external deps) with:
• Summary cards (avg score, auto-accepted, rewrites, below-threshold)
• Score timeline bar chart (one bar per chapter, colour-coded)
• Score distribution histogram
• Chapter breakdown table with expand-on-click critique details
(attempt number, score, decision badge, full critique text)
• Critique pattern frequency table (keyword mining across all critiques)
• Auto-generated prompt tuning observations (systemic issues, POV
character weak spots, pacing type analysis, climax vs. early
chapter comparison)
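  For context, the logging half of the new module is roughly the sketch below.
  The real generate_html_report additionally renders the summary cards, charts,
  and tuning observations listed above; any field or constant not visible in the
  writer.py diff further down (e.g. LOG_NAME) is an assumption here.

    # Sketch of story/eval_logger.py (illustrative only; report rendering is much richer in the module itself)
    import json
    import os
    from html import escape

    LOG_NAME = "eval_log.json"  # assumed constant; matches the filename described above

    def append_eval_entry(folder, entry):
        """Append one chapter's evaluation record to eval_log.json in the book folder."""
        path = os.path.join(folder, LOG_NAME)
        entries = []
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as f:
                entries = json.load(f)
        entries.append(entry)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(entries, f, indent=2, ensure_ascii=False)

    def generate_html_report(folder, bp):
        """Render eval_log.json into one self-contained HTML string (no external assets)."""
        # bp (the book blueprint) feeds the POV/pacing analysis sections; unused in this sketch.
        path = os.path.join(folder, LOG_NAME)
        with open(path, "r", encoding="utf-8") as f:
            entries = json.load(f)
        scores = [e.get("final_score", 0) for e in entries]
        avg = sum(scores) / len(scores) if scores else 0
        rows = "".join(
            f"<tr><td>{e.get('chapter_num')}</td><td>{escape(e.get('title', ''))}</td>"
            f"<td>{e.get('final_score')}</td><td>{escape(e.get('final_decision', ''))}</td></tr>"
            for e in entries
        )
        return ("<html><body><h1>Evaluation Report</h1>"
                f"<p>Chapters: {len(entries)} &middot; Avg final score: {avg:.1f}</p>"
                f"<table>{rows}</table></body></html>")

  Keeping the log as one flat JSON list per book means the report can be
  regenerated at any time without touching the writer pipeline.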
story/writer.py:
- Imports time and eval_logger.
- Initialises _eval_entry dict (chapter metadata + polish flags + thresholds)
after all threshold variables are set.
- Records each evaluation attempt's score, critique (truncated to 700 chars),
and decision (auto_accepted / full_rewrite / refinement / accepted /
below_threshold / eval_error / refinement_failed) before every return.
web/routes/run.py:
- Imports story_eval_logger.
- New route GET /project/<run_id>/eval_report/<book_folder>: loads
eval_log.json, calls generate_html_report(), returns the HTML as a
downloadable attachment named eval_report_<title>.html.
Returns a user-friendly "not yet available" page if no log exists.
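  The route itself is small; a rough sketch assuming Flask (as the <run_id>
  converter syntax and blueprint layout suggest), with placeholder names for the
  blueprint and the folder-resolution helper, neither of which is shown here:

    # Sketch of the new route; run_bp and _book_folder_for are hypothetical stand-ins
    import os
    from flask import Blueprint, Response
    from story import eval_logger

    run_bp = Blueprint("run", __name__)  # placeholder; run.py already defines its own blueprint

    def _book_folder_for(run_id, book_folder):
        # placeholder for however run.py maps a run id + folder name to a path on disk
        return os.path.join("runs", run_id, book_folder)

    @run_bp.route("/project/<run_id>/eval_report/<book_folder>")
    def eval_report(run_id, book_folder):
        folder = _book_folder_for(run_id, book_folder)
        if not os.path.exists(os.path.join(folder, "eval_log.json")):
            # friendly placeholder page instead of an error when nothing has been logged yet
            return "<h3>Evaluation report not yet available for this book.</h3>"
        html = eval_logger.generate_html_report(folder, bp=None)  # the real route loads the blueprint here
        title = os.path.basename(book_folder)  # the real route uses the book title
        return Response(
            html,
            mimetype="text/html",
            headers={"Content-Disposition": f"attachment; filename=eval_report_{title}.html"},
        )

  Serving the report as an attachment keeps it a single file that can be handed
  to reviewers without standing up any extra tooling.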
templates/run_details.html:
- Adds "Eval Report" (btn-outline-info) button next to "Check Consistency"
in each book's artifact section.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
story/writer.py:

@@ -1,9 +1,11 @@
import json
import os
import time
from core import config, utils
from ai import models as ai_models
from story.style_persona import get_style_guidelines
from story.editor import evaluate_chapter_quality
from story import eval_logger


def get_genre_instructions(genre):

@@ -443,6 +445,25 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
    SCORE_PASSING = 7
    SCORE_REWRITE_THRESHOLD = 6

    # Evaluation log entry — written to eval_log.json for the HTML report.
    _eval_entry = {
        "ts": time.strftime('%Y-%m-%d %H:%M:%S'),
        "chapter_num": chap['chapter_number'],
        "title": chap.get('title', ''),
        "pov_character": chap.get('pov_character', ''),
        "pacing": pacing,
        "target_words": est_words,
        "actual_words": draft_words,
        "chapter_position": chapter_position,
        "score_threshold": SCORE_PASSING,
        "score_auto_accept": SCORE_AUTO_ACCEPT,
        "polish_applied": bool(current_text and not _skip_polish),
        "filter_word_density": round(_fw_density, 4),
        "attempts": [],
        "final_score": 0,
        "final_decision": "unknown",
    }

    best_score = 0
    best_text = current_text
    past_critiques = []

@@ -452,16 +473,27 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
        score, critique = evaluate_chapter_quality(current_text, chap['title'], meta.get('genre', 'Fiction'), ai_models.model_logic, folder, series_context=series_block.strip())

        past_critiques.append(f"Attempt {attempt}: {critique}")
        _att = {"n": attempt, "score": score, "critique": critique[:700], "decision": None}

        if "Evaluation error" in critique:
            utils.log("WRITER", f" ⚠️ {critique}. Keeping current draft.")
            if best_score == 0: best_text = current_text
            _att["decision"] = "eval_error"
            _eval_entry["attempts"].append(_att)
            _eval_entry["final_score"] = best_score
            _eval_entry["final_decision"] = "eval_error"
            eval_logger.append_eval_entry(folder, _eval_entry)
            break

        utils.log("WRITER", f" Score: {score}/10. Critique: {critique}")

        if score >= SCORE_AUTO_ACCEPT:
            utils.log("WRITER", " 🌟 Auto-Accept threshold met.")
            _att["decision"] = "auto_accepted"
            _eval_entry["attempts"].append(_att)
            _eval_entry["final_score"] = score
            _eval_entry["final_decision"] = "auto_accepted"
            eval_logger.append_eval_entry(folder, _eval_entry)
            return current_text

        if score > best_score:

@@ -471,9 +503,19 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
        if attempt == max_attempts:
            if best_score >= SCORE_PASSING:
                utils.log("WRITER", f" ✅ Max attempts reached. Accepting best score ({best_score}).")
                _att["decision"] = "accepted"
                _eval_entry["attempts"].append(_att)
                _eval_entry["final_score"] = best_score
                _eval_entry["final_decision"] = "accepted"
                eval_logger.append_eval_entry(folder, _eval_entry)
                return best_text
            else:
                utils.log("WRITER", f" ⚠️ Quality low ({best_score}/{SCORE_PASSING}) but max attempts reached. Proceeding.")
                _att["decision"] = "below_threshold"
                _eval_entry["attempts"].append(_att)
                _eval_entry["final_score"] = best_score
                _eval_entry["final_decision"] = "below_threshold"
                eval_logger.append_eval_entry(folder, _eval_entry)
                return best_text

        if score < SCORE_REWRITE_THRESHOLD:

@@ -495,10 +537,17 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
                utils.log_usage(folder, ai_models.model_logic.name, resp_rewrite.usage_metadata)
                current_text = resp_rewrite.text
                ai_models.model_logic.update(ai_models.logic_model_name)
                _att["decision"] = "full_rewrite"
                _eval_entry["attempts"].append(_att)
                continue
            except Exception as e:
                ai_models.model_logic.update(ai_models.logic_model_name)
                utils.log("WRITER", f"Full rewrite failed: {e}. Falling back to refinement.")
                _att["decision"] = "full_rewrite_failed"
                # fall through to refinement; decision will be overwritten below

        else:
            _att["decision"] = "refinement"

        utils.log("WRITER", f" -> Refining Ch {chap['chapter_number']} based on feedback...")

@@ -553,8 +602,21 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None,
            resp_refine = ai_models.model_writer.generate_content(refine_prompt)
            utils.log_usage(folder, ai_models.model_writer.name, resp_refine.usage_metadata)
            current_text = resp_refine.text
            if _att["decision"] == "full_rewrite_failed":
                _att["decision"] = "refinement"  # rewrite failed, fell back to refinement
            _eval_entry["attempts"].append(_att)
        except Exception as e:
            utils.log("WRITER", f"Refinement failed: {e}")
            _att["decision"] = "refinement_failed"
            _eval_entry["attempts"].append(_att)
            _eval_entry["final_score"] = best_score
            _eval_entry["final_decision"] = "refinement_failed"
            eval_logger.append_eval_entry(folder, _eval_entry)
            return best_text

    # Reached only if eval_error break occurred; write log before returning.
    if _eval_entry["final_decision"] == "unknown":
        _eval_entry["final_score"] = best_score
        _eval_entry["final_decision"] = "best_available"
        eval_logger.append_eval_entry(folder, _eval_entry)
    return best_text