def append_eval_entry(folder, entry):
    """Append one chapter's evaluation record to ``eval_log.json`` in *folder*.

    The log is a JSON list with one element per call. The write is
    best-effort: any failure is reported through ``utils.log`` and never
    raised to the caller.

    Args:
        folder: Directory that holds (or will hold) ``eval_log.json``.
        entry: JSON-serialisable record describing one chapter evaluation.
    """
    log_path = os.path.join(folder, "eval_log.json")

    data = []
    try:
        with open(log_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except FileNotFoundError:
        pass  # First entry for this book: start a fresh log.
    except (OSError, ValueError) as e:
        # Unreadable or corrupt log: start over, but leave a trace of it.
        utils.log("EVAL", f"Existing eval log unreadable, starting fresh: {e}")
    else:
        # Anything other than a list is not a log we can append to.
        if isinstance(loaded, list):
            data = loaded

    data.append(entry)

    # Serialise to a sibling temp file and swap it in. Opening the real log
    # with 'w' truncates it immediately, so a failed json.dump (for example a
    # non-serialisable value in `entry`) would wipe all earlier records.
    tmp_path = f"{log_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, log_path)
    except Exception as e:  # logging must never break the caller
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        utils.log("EVAL", f"Failed to write eval log: {e}")
def generate_html_report(folder, bp=None):
    """Generate a self-contained HTML evaluation report from ``eval_log.json``.

    Args:
        folder: Book folder containing ``eval_log.json``.
        bp: Optional blueprint dict; ``book_metadata.title`` and
            ``book_metadata.genre`` are used for the report header.

    Returns:
        The HTML string, or ``None`` when the log is missing, unreadable,
        not a list, or contains no chapter records.
    """
    log_path = os.path.join(folder, "eval_log.json")
    try:
        with open(log_path, 'r', encoding='utf-8') as f:
            raw = json.load(f)
    except (OSError, ValueError):
        return None

    if not isinstance(raw, list):
        return None
    # Only dict records can be summarised; anything else is ignored.
    chapters = [c for c in raw if isinstance(c, dict)]
    if not chapters:
        return None

    title, genre = "Unknown Book", "Fiction"
    if bp:
        meta = bp.get('book_metadata') or {}
        if isinstance(meta, dict):
            title = meta.get('title') or title
            genre = meta.get('genre') or genre

    def _attempts(chapter):
        """Return the chapter's attempt records, tolerating null or junk."""
        attempts = chapter.get('attempts')
        if not isinstance(attempts, list):
            return []
        return [a for a in attempts if isinstance(a, dict)]

    # --- Summary stats ---
    scores = []
    score_dist = {i: 0 for i in range(1, 11)}
    for c in chapters:
        s = c.get('final_score', 0)
        # bool is an int subclass but is never a real score.
        if isinstance(s, bool) or not isinstance(s, (int, float)) or not s > 0:
            continue
        scores.append(s)
        if s <= 10:
            # Bucket by whole score so 7.5 lands in "7" instead of vanishing.
            score_dist[max(1, int(s))] += 1

    avg_score = round(sum(scores) / len(scores), 2) if scores else 0
    total = len(chapters)
    auto_accepted = sum(1 for c in chapters if c.get('final_decision') == 'auto_accepted')
    multi_attempt = sum(1 for c in chapters if len(_attempts(c)) > 1)
    full_rewrites = sum(
        1 for c in chapters for a in _attempts(c)
        if a.get('decision') == 'full_rewrite'
    )
    below_threshold = sum(1 for c in chapters if c.get('final_decision') == 'below_threshold')
    polish_applied = sum(1 for c in chapters if c.get('polish_applied'))

    patterns = _mine_critique_patterns(chapters, total)
    report_date = time.strftime('%Y-%m-%d %H:%M')
    return _build_html(title, genre, report_date, chapters, avg_score, total,
                       auto_accepted, multi_attempt, full_rewrites, below_threshold,
                       polish_applied, score_dist, patterns)
["summary mode", "summariz", "telling", "show don't tell", "show, don't tell", "instead of dramatiz"], + "Emotion labeling": ["emotion label", "told the reader", "labeling", "labelling", "she felt", "he felt", "was nervous", "was angry", "was sad"], + "Deep POV issues": ["deep pov", "deep point of view", "distant narration", "remove the reader", "external narration"], + "Pacing problems": ["pacing", "rushing", "too fast", "too slow", "dragging", "sagging", "abrupt"], + "Dialogue too on-the-nose": ["on-the-nose", "on the nose", "subtext", "exposition dump", "characters explain"], + "Weak chapter hook / ending": ["hook", "cliffhanger", "cut off abruptly", "anticlimax", "ending falls flat", "no tension"], + "Passive voice / weak verbs": ["passive voice", "was [v", "were [v", "weak verb", "adverb"], + "AI-isms / clichés": ["ai-ism", "cliché", "tapestry", "palpable", "testament", "azure", "cerulean", "bustling"], + "Voice / tone inconsistency": ["voice", "tone inconsist", "persona", "shift in tone", "register"], + "Missing sensory / atmosphere": ["sensory", "grounding", "atmosphere", "immersiv", "white room"], + } + counts = {} + for pattern, keywords in pattern_keywords.items(): + matching = [] + for c in chapters: + critique_blob = " ".join( + a.get('critique', '').lower() + for a in c.get('attempts', []) + ) + if any(kw.lower() in critique_blob for kw in keywords): + matching.append(c.get('chapter_num', '?')) + counts[pattern] = {'count': len(matching), 'chapters': matching} + return dict(sorted(counts.items(), key=lambda x: x[1]['count'], reverse=True)) + + +# --------------------------------------------------------------------------- +# HTML builder +# --------------------------------------------------------------------------- + +def _score_color(s): + try: + s = float(s) + except (TypeError, ValueError): + return '#6c757d' + if s >= 8: return '#28a745' + if s >= 7: return '#20c997' + if s >= 6: return '#ffc107' + return '#dc3545' + + +def _decision_badge(d): + MAP 
= { + 'auto_accepted': ('⚡ Auto-Accept', '#28a745'), + 'accepted': ('✓ Accepted', '#17a2b8'), + 'accepted_at_max': ('✓ Accepted', '#17a2b8'), + 'below_threshold': ('⚠ Below Threshold', '#dc3545'), + 'below_threshold_accepted': ('⚠ Below Threshold', '#dc3545'), + 'full_rewrite': ('🔄 Full Rewrite', '#6f42c1'), + 'full_rewrite_failed': ('🔄✗ Rewrite Failed','#6f42c1'), + 'refinement': ('✏ Refined', '#fd7e14'), + 'refinement_failed': ('✏✗ Refine Failed', '#fd7e14'), + 'eval_error': ('⚠ Eval Error', '#6c757d'), + } + label, color = MAP.get(d, (d or '?', '#6c757d')) + return f'{label}' + + +def _safe_int_fmt(v): + try: + return f"{int(v):,}" + except (TypeError, ValueError): + return str(v) if v else '?' + + +def _build_html(title, genre, report_date, chapters, avg_score, total, + auto_accepted, multi_attempt, full_rewrites, below_threshold, + polish_applied, score_dist, patterns): + + avg_color = _score_color(avg_score) + + # --- Score timeline --- + MAX_BAR = 260 + timeline_rows = '' + for c in chapters: + s = c.get('final_score', 0) + color = _score_color(s) + width = max(2, int((s / 10) * MAX_BAR)) if s else 2 + ch_num = c.get('chapter_num', '?') + ch_title = str(c.get('title', ''))[:35] + timeline_rows += ( + f'
' + f'
Ch {ch_num}
' + f'
' + f'
{s}/10 — {ch_title}
' + f'
' + ) + + # --- Score distribution --- + max_dist = max(score_dist.values()) if any(score_dist.values()) else 1 + dist_rows = '' + for sv in range(10, 0, -1): + count = score_dist.get(sv, 0) + w = max(2, int((count / max_dist) * 200)) if count else 0 + color = _score_color(sv) + dist_rows += ( + f'
' + f'
{sv}
' + f'
' + f'
{count} ch{"apters" if count != 1 else "apter"}
' + f'
' + ) + + # --- Chapter rows --- + chapter_rows = '' + for c in chapters: + cid = c.get('chapter_num', 0) + ch_title = str(c.get('title', '')).replace('<', '<').replace('>', '>') + pov = str(c.get('pov_character') or '—') + pace = str(c.get('pacing') or '—') + target_w = _safe_int_fmt(c.get('target_words')) + actual_w = _safe_int_fmt(c.get('actual_words')) + pos = c.get('chapter_position') + pos_pct = f"{int(pos * 100)}%" if pos is not None else '—' + threshold = c.get('score_threshold', '?') + fw_dens = c.get('filter_word_density', 0) + polish = '✓' if c.get('polish_applied') else '✗' + polish_c = '#28a745' if c.get('polish_applied') else '#aaa' + fs = c.get('final_score', 0) + fd = c.get('final_decision', '') + attempts = c.get('attempts', []) + n_att = len(attempts) + fs_color = _score_color(fs) + fd_badge = _decision_badge(fd) + + # Attempt detail sub-rows + att_rows = '' + for att in attempts: + an = att.get('n', '?') + ascr = att.get('score', '?') + adec = att.get('decision', '') + acrit = str(att.get('critique', 'No critique.')).replace('&', '&').replace('<', '<').replace('>', '>') + ac = _score_color(ascr) + abadge = _decision_badge(adec) + att_rows += ( + f'' + f'' + f'
Attempt {an}:' + f'{ascr}/10' + f'{abadge}
' + f'
{acrit}
' + f'' + ) + + chapter_rows += ( + f'' + f'{cid}' + f'{ch_title}' + f'{pov}' + f'{pace}' + f'{actual_w} /{target_w}' + f'{pos_pct}' + f'{threshold}' + f'{polish} {fw_dens:.3f}' + f'{fs}' + f'{n_att}×' + f'{fd_badge}' + f'' + f'{att_rows}' + ) + + # --- Critique patterns --- + pat_rows = '' + for pattern, data in patterns.items(): + count = data['count'] + if count == 0: + continue + pct = int(count / total * 100) if total else 0 + sev_color = '#dc3545' if pct >= 50 else '#fd7e14' if pct >= 30 else '#17a2b8' + chlist = ', '.join(f'Ch {x}' for x in data['chapters'][:10]) + if len(data['chapters']) > 10: + chlist += f' (+{len(data["chapters"]) - 10} more)' + pat_rows += ( + f'' + f'{pattern}' + f'{count}/{total} ({pct}%)' + f'{chlist}' + f'' + ) + if not pat_rows: + pat_rows = 'No significant patterns detected.' + + # --- Prompt tuning notes --- + notes = _generate_prompt_notes(chapters, avg_score, total, full_rewrites, below_threshold, patterns) + notes_html = ''.join(f'
  • {n}
  • ' for n in notes) + + return f''' + + + + +Eval Report — {title} + + + +
    + +
    +

    BookApp — Evaluation Report

    +

    {title}

    +

    Genre: {genre}  |  Generated: {report_date}  |  {total} chapter{"s" if total != 1 else ""}

    +
    + +
    +
    {avg_score}
    Avg Score /10
    +
    {auto_accepted}
    Auto-Accepted (8+)
    +
    {multi_attempt}
    Multi-Attempt
    +
    {full_rewrites}
    Full Rewrites
    +
    {below_threshold}
    Below Threshold
    +
    {polish_applied}
    Polish Passes
    +
    + +
    +
    +

    📊 Score Timeline

    +
    + 8–10 Great + 7–7.9 Good + 6–6.9 Passable + <6 Fail +
    +
    {timeline_rows}
    +
    +
    +

    📈 Score Distribution

    +
    {dist_rows}
    +
    +
    + +
    +

    📋 Chapter Breakdown  (click any row to expand critiques)

    +
    + + + + + + + + + + + + {chapter_rows} +
    #TitlePOVPacingWordsPos%ThresholdPolish / FWScoreAtt.Decision
    +
    +
    + +
    +

    🔍 Critique Patterns  Keyword frequency across all evaluation critiques — high % = prompt gap

    + + + {pat_rows} +
    Issue PatternFrequencyAffected Chapters
    +
    + +
    +

    💡 Prompt Tuning Observations

    + +
    + +
    + + +''' + + +# --------------------------------------------------------------------------- +# Auto-observations for prompt tuning +# --------------------------------------------------------------------------- + +def _generate_prompt_notes(chapters, avg_score, total, full_rewrites, below_threshold, patterns): + notes = [] + + # Overall score + if avg_score >= 8: + notes.append(f"✅ High average score ({avg_score}/10). The generation pipeline is performing well. Focus on the few outlier chapters below the threshold.") + elif avg_score >= 7: + notes.append(f"✓ Solid average score ({avg_score}/10). Minor prompt reinforcement should push this above 8. Focus on the most common critique pattern.") + elif avg_score >= 6: + notes.append(f"⚠ Average score of {avg_score}/10 is below target. Strengthen the draft prompt's Deep POV mandate and filter-word removal rules.") + else: + notes.append(f"🚨 Low average score ({avg_score}/10). The core writing prompt needs significant work — review the Deep POV mandate, genre mandates, and consider adding concrete negative examples.") + + # Full rewrite rate + if total > 0: + rw_pct = int(full_rewrites / total * 100) + if rw_pct > 30: + notes.append(f"🔄 High full-rewrite rate ({rw_pct}%, {full_rewrites} triggers). The initial draft prompt produces too many sub-6 drafts. Add stronger examples or tighten the DEEP_POV_MANDATE and PROSE_RULES sections.") + elif rw_pct > 15: + notes.append(f"↩ Moderate full-rewrite rate ({rw_pct}%, {full_rewrites} triggers). The draft quality could be improved. Check the genre mandates for the types of chapters that rewrite most often.") + + # Below threshold + if below_threshold > 0: + bt_pct = int(below_threshold / total * 100) + notes.append(f"⚠ {below_threshold} chapter{'s' if below_threshold != 1 else ''} ({bt_pct}%) finished below the quality threshold. 
Inspect the individual critiques to see if these cluster by POV, pacing, or story position.") + + # Top critique patterns + for pattern, data in list(patterns.items())[:5]: + pct = int(data['count'] / total * 100) if total else 0 + if pct >= 50: + notes.append(f"🔴 '{pattern}' appears in {pct}% of critiques. This is systemic — the current prompt does not prevent it. Add an explicit enforcement instruction with a concrete example of the wrong pattern and the correct alternative.") + elif pct >= 30: + notes.append(f"🟡 '{pattern}' mentioned in {pct}% of critiques. Consider reinforcing the relevant prompt instruction with a stronger negative example.") + + # Climax vs. early chapter comparison + high_scores = [c.get('final_score', 0) for c in chapters if isinstance(c.get('chapter_position'), float) and c['chapter_position'] >= 0.75] + low_scores = [c.get('final_score', 0) for c in chapters if isinstance(c.get('chapter_position'), float) and c['chapter_position'] < 0.25] + if high_scores and low_scores: + avg_climax = round(sum(high_scores) / len(high_scores), 1) + avg_early = round(sum(low_scores) / len(low_scores), 1) + if avg_climax < avg_early - 0.5: + notes.append(f"📅 Climax chapters average {avg_climax}/10 vs early chapters {avg_early}/10. The high-stakes scenes underperform. Strengthen the genre mandates for climax pacing and consider adding specific instructions for emotional payoff.") + elif avg_climax > avg_early + 0.5: + notes.append(f"📅 Climax chapters outperform early chapters ({avg_climax} vs {avg_early}). 
Good — the adaptive threshold and extra attempts are concentrating quality where it matters.") + + # POV character analysis + pov_scores = {} + for c in chapters: + pov = c.get('pov_character') or 'Unknown' + s = c.get('final_score', 0) + if s > 0: + pov_scores.setdefault(pov, []).append(s) + for pov, sc in sorted(pov_scores.items(), key=lambda x: sum(x[1]) / len(x[1])): + if len(sc) >= 2 and sum(sc) / len(sc) < 6.5: + avg_pov = round(sum(sc) / len(sc), 1) + notes.append(f"👤 POV '{pov}' averages {avg_pov}/10. Consider adding or strengthening a character voice profile for this character, or refining the persona bio to match how this POV character should speak and think.") + + # Pacing analysis + pace_scores = {} + for c in chapters: + pace = c.get('pacing', 'Standard') + s = c.get('final_score', 0) + if s > 0: + pace_scores.setdefault(pace, []).append(s) + for pace, sc in pace_scores.items(): + if len(sc) >= 3 and sum(sc) / len(sc) < 6.5: + avg_p = round(sum(sc) / len(sc), 1) + notes.append(f"⏩ '{pace}' pacing chapters average {avg_p}/10. The writing model struggles with this rhythm. Revisit the PACING_GUIDE instructions for '{pace}' chapters — they may need more concrete direction.") + + if not notes: + notes.append("No significant patterns detected. Review the individual chapter critiques for targeted improvements.") + return notes diff --git a/story/writer.py b/story/writer.py index 021ba6f..957d05a 100644 --- a/story/writer.py +++ b/story/writer.py @@ -1,9 +1,11 @@ import json import os +import time from core import config, utils from ai import models as ai_models from story.style_persona import get_style_guidelines from story.editor import evaluate_chapter_quality +from story import eval_logger def get_genre_instructions(genre): @@ -443,6 +445,25 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, SCORE_PASSING = 7 SCORE_REWRITE_THRESHOLD = 6 + # Evaluation log entry — written to eval_log.json for the HTML report. 
+ _eval_entry = { + "ts": time.strftime('%Y-%m-%d %H:%M:%S'), + "chapter_num": chap['chapter_number'], + "title": chap.get('title', ''), + "pov_character": chap.get('pov_character', ''), + "pacing": pacing, + "target_words": est_words, + "actual_words": draft_words, + "chapter_position": chapter_position, + "score_threshold": SCORE_PASSING, + "score_auto_accept": SCORE_AUTO_ACCEPT, + "polish_applied": bool(current_text and not _skip_polish), + "filter_word_density": round(_fw_density, 4), + "attempts": [], + "final_score": 0, + "final_decision": "unknown", + } + best_score = 0 best_text = current_text past_critiques = [] @@ -452,16 +473,27 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, score, critique = evaluate_chapter_quality(current_text, chap['title'], meta.get('genre', 'Fiction'), ai_models.model_logic, folder, series_context=series_block.strip()) past_critiques.append(f"Attempt {attempt}: {critique}") + _att = {"n": attempt, "score": score, "critique": critique[:700], "decision": None} if "Evaluation error" in critique: utils.log("WRITER", f" ⚠️ {critique}. Keeping current draft.") if best_score == 0: best_text = current_text + _att["decision"] = "eval_error" + _eval_entry["attempts"].append(_att) + _eval_entry["final_score"] = best_score + _eval_entry["final_decision"] = "eval_error" + eval_logger.append_eval_entry(folder, _eval_entry) break utils.log("WRITER", f" Score: {score}/10. 
Critique: {critique}") if score >= SCORE_AUTO_ACCEPT: utils.log("WRITER", " 🌟 Auto-Accept threshold met.") + _att["decision"] = "auto_accepted" + _eval_entry["attempts"].append(_att) + _eval_entry["final_score"] = score + _eval_entry["final_decision"] = "auto_accepted" + eval_logger.append_eval_entry(folder, _eval_entry) return current_text if score > best_score: @@ -471,9 +503,19 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, if attempt == max_attempts: if best_score >= SCORE_PASSING: utils.log("WRITER", f" ✅ Max attempts reached. Accepting best score ({best_score}).") + _att["decision"] = "accepted" + _eval_entry["attempts"].append(_att) + _eval_entry["final_score"] = best_score + _eval_entry["final_decision"] = "accepted" + eval_logger.append_eval_entry(folder, _eval_entry) return best_text else: utils.log("WRITER", f" ⚠️ Quality low ({best_score}/{SCORE_PASSING}) but max attempts reached. Proceeding.") + _att["decision"] = "below_threshold" + _eval_entry["attempts"].append(_att) + _eval_entry["final_score"] = best_score + _eval_entry["final_decision"] = "below_threshold" + eval_logger.append_eval_entry(folder, _eval_entry) return best_text if score < SCORE_REWRITE_THRESHOLD: @@ -495,10 +537,17 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, utils.log_usage(folder, ai_models.model_logic.name, resp_rewrite.usage_metadata) current_text = resp_rewrite.text ai_models.model_logic.update(ai_models.logic_model_name) + _att["decision"] = "full_rewrite" + _eval_entry["attempts"].append(_att) continue except Exception as e: ai_models.model_logic.update(ai_models.logic_model_name) utils.log("WRITER", f"Full rewrite failed: {e}. 
Falling back to refinement.") + _att["decision"] = "full_rewrite_failed" + # fall through to refinement; decision will be overwritten below + + else: + _att["decision"] = "refinement" utils.log("WRITER", f" -> Refining Ch {chap['chapter_number']} based on feedback...") @@ -553,8 +602,21 @@ def write_chapter(chap, bp, folder, prev_sum, tracking=None, prev_content=None, resp_refine = ai_models.model_writer.generate_content(refine_prompt) utils.log_usage(folder, ai_models.model_writer.name, resp_refine.usage_metadata) current_text = resp_refine.text + if _att["decision"] == "full_rewrite_failed": + _att["decision"] = "refinement" # rewrite failed, fell back to refinement + _eval_entry["attempts"].append(_att) except Exception as e: utils.log("WRITER", f"Refinement failed: {e}") + _att["decision"] = "refinement_failed" + _eval_entry["attempts"].append(_att) + _eval_entry["final_score"] = best_score + _eval_entry["final_decision"] = "refinement_failed" + eval_logger.append_eval_entry(folder, _eval_entry) return best_text + # Reached only if eval_error break occurred; write log before returning. 
+ if _eval_entry["final_decision"] == "unknown": + _eval_entry["final_score"] = best_score + _eval_entry["final_decision"] = "best_available" + eval_logger.append_eval_entry(folder, _eval_entry) return best_text diff --git a/templates/run_details.html b/templates/run_details.html index 8102ea2..c0400af 100644 --- a/templates/run_details.html +++ b/templates/run_details.html @@ -208,6 +208,9 @@ Check Consistency + + Eval Report + diff --git a/web/routes/run.py b/web/routes/run.py index fdd7118..5b8fdda 100644 --- a/web/routes/run.py +++ b/web/routes/run.py @@ -10,7 +10,7 @@ from core import utils from ai import models as ai_models from ai import setup as ai_setup from story import editor as story_editor -from story import bible_tracker, style_persona +from story import bible_tracker, style_persona, eval_logger as story_eval_logger from export import exporter from web.tasks import huey, regenerate_artifacts_task, rewrite_chapter_task @@ -434,6 +434,45 @@ def delete_run(id): return redirect(url_for('project.view_project', id=project_id)) +@run_bp.route('/project//eval_report/') +@login_required +def eval_report(run_id, book_folder): + """Generate and download the self-contained HTML evaluation report.""" + run = db.session.get(Run, run_id) or Run.query.get_or_404(run_id) + if run.project.user_id != current_user.id: + return "Unauthorized", 403 + + if not book_folder or "/" in book_folder or "\\" in book_folder or ".." in book_folder: + return "Invalid book folder", 400 + + run_dir = os.path.join(run.project.folder_path, "runs", f"run_{run.id}") + book_path = os.path.join(run_dir, book_folder) + + bp = utils.load_json(os.path.join(book_path, "final_blueprint.json")) or \ + utils.load_json(os.path.join(book_path, "blueprint_initial.json")) + + html = story_eval_logger.generate_html_report(book_path, bp) + if not html: + return ( + "" + "

    No evaluation data yet.

    " + "

    The evaluation report is generated during the writing phase. " + "Start a generation run and the report will be available once chapters have been evaluated.

    " + "" + ), 200 + + from flask import Response + safe_title = utils.sanitize_filename( + (bp or {}).get('book_metadata', {}).get('title', book_folder) or book_folder + )[:40] + filename = f"eval_report_{safe_title}.html" + return Response( + html, + mimetype='text/html', + headers={'Content-Disposition': f'attachment; filename="{filename}"'} + ) + + @run_bp.route('/run//download_bible') @login_required def download_bible(id):