diff --git a/ai_blueprint.md b/ai_blueprint.md deleted file mode 100644 index 34588d2..0000000 --- a/ai_blueprint.md +++ /dev/null @@ -1,56 +0,0 @@ -# AI Blueprint: Addressing Stuck Book Generation Jobs - -> **Status: IMPLEMENTED — v2.14** -> All five steps below were implemented on 2026-02-21. - -## 1. The Problem: Progress Stalls - -The primary issue is that book generation jobs can get "stuck" in a "running" state, preventing users from starting new runs and causing confusion as the UI shows no progress. This is likely caused by worker processes crashing or encountering unhandled errors before they can update the job's final status to "completed" or "failed". - -## 2. Investigation Findings - -- **State Management:** The `Run` table in the database has a `status` column. Tasks in `web/tasks.py` are responsible for updating this from "queued" to "running" and finally to "completed" or "failed". -- **Point of Failure:** The most likely failure point is a catastrophic crash of the Huey worker process (e.g., out-of-memory error) or a deadlock within the core `cli.engine.run_generation` function. In these scenarios, the `finally` block that updates the status is never reached. -- **Database Contention:** The direct use of `sqlite3` in the tasks can lead to `database is locked` errors. While there are some retries, prolonged locks could cause status updates to fail. -- **Silent Errors:** Some task functions use a bare `try...except: pass` around the final status update. If updating the database fails, the error is swallowed, and the job remains in a "running" state. - -## 3. The Plan: Enhancing Robustness - -### Step 1: Implement a "Stale Job" Cleanup Process ✅ - -- **`last_heartbeat` column added to `Run` model** (`web/db.py`). -- **Migration** added in `web/app.py` startup to add `last_heartbeat` column to existing databases. -- **Startup reset** already present — all `status='running'` jobs are reset to `failed` at boot. -- **Periodic stale-job watcher thread** (`_stale_job_watcher`) started in `web/app.py`: - - Runs every 5 minutes. - - Marks jobs `failed` if `last_heartbeat` is > 15 minutes stale. - - Marks jobs `failed` if `start_time` is > 2 hours old and no heartbeat was ever recorded. - -### Step 2: Fortify Database Updates ✅ - -- **`_robust_update_run_status()`** helper added to `web/tasks.py`: - - 5 retries with linear backoff (1–5 seconds per attempt). - - Handles `sqlite3.OperationalError` specifically with retry; raises `RuntimeError` on total failure. -- All bare `except: pass` blocks around DB status updates removed from: - - `generate_book_task` — final status update now uses robust helper with retry. - - `regenerate_artifacts_task` — all three status-update sites fixed. - - `rewrite_chapter_task` — `db_path` moved above the outer `try` block to prevent `NameError`; all status-update sites fixed. - -### Step 3: Add Granular Logging to Core Engine ✅ - -- **`cli/engine.py` — `run_generation()`**: logs series title at start; logs start/finish of each `process_book` call; catches and re-logs exceptions before re-raising. -- **`cli/engine.py` — `process_book()`**: Added `--- Phase: X ---` banners at the start of each major stage (Blueprint, Structure & Events, Chapter Planning, Writing, Post-Processing). Each phase is wrapped in `try/except` that logs `ERROR` with the exception type before re-raising. - -### Step 4: Introduce a Task Heartbeat ✅ - -- **`core/utils.py`**: `set_heartbeat_callback()` and `send_heartbeat()` added (mirrors the existing progress/log callback pattern). -- **`web/tasks.py`**: `db_heartbeat_callback()` writes `last_heartbeat = NOW` to the DB with up to 3 retries. Set as the heartbeat callback in `generate_book_task`. -- **`cli/engine.py`**: `utils.send_heartbeat()` called after each chapter is saved to disk — the most meaningful signal that the worker is still processing. - -### Step 5: Commit and Push Changes ✅ - -Changes committed to `main` branch with message `Auto-commit: v2.14 — Stuck job robustness (heartbeat, retry, stale watcher, granular logging)`. - ---- - -This multi-layered approach will significantly reduce the chances of jobs getting stuck and provide better diagnostics if they do. It ensures the system can recover gracefully from worker failures and database locks. diff --git a/templates/project.html b/templates/project.html index ea5fe18..359f88d 100644 --- a/templates/project.html +++ b/templates/project.html @@ -19,17 +19,17 @@
-
- {% if runs and runs[0].status in ['running', 'queued'] %} -
-
- {% endif %} + {% endfor %}
@@ -46,6 +46,36 @@ + +{% if active_runs %} +
+
+
Active Jobs ({{ active_runs|length }})
+
+
+
+ {% for ar in active_runs %} +
+ {{ ar.status|upper }} +
+ Run #{{ ar.id }} + Started: {{ ar.start_time.strftime('%Y-%m-%d %H:%M') if ar.start_time else 'Pending' }} + {% if ar.progress %} +
+
+
+ {% endif %} +
+ + View Details + +
+ {% endfor %} +
+
+
+{% endif %} +
diff --git a/web/app.py b/web/app.py index 90bdfeb..1039b20 100644 --- a/web/app.py +++ b/web/app.py @@ -116,27 +116,20 @@ with app.app_context(): _log("System: Added 'last_heartbeat' column to Run table.") except: pass - # Reset stuck runs on startup + # Reset all non-terminal runs on startup (running, queued, interrupted) + # The Huey consumer restarts with the app, so any in-flight tasks are gone. try: - stuck_runs = Run.query.filter_by(status='running').all() - if stuck_runs: - _log(f"System: Found {len(stuck_runs)} stuck run(s) — resetting to 'failed'.") - for r in stuck_runs: + _NON_TERMINAL = ['running', 'queued', 'interrupted'] + non_terminal = Run.query.filter(Run.status.in_(_NON_TERMINAL)).all() + if non_terminal: + _log(f"System: Resetting {len(non_terminal)} non-terminal run(s) to 'failed' on startup:") + for r in non_terminal: + _log(f" - Run #{r.id} was '{r.status}' — now 'failed'.") r.status = 'failed' r.end_time = datetime.utcnow() db.session.commit() - # Also reset stuck 'queued' runs whose task entry was lost from queue.db - import sqlite3 as _sqlite3 - _queue_path = os.path.join(config.DATA_DIR, 'queue.db') - if os.path.exists(_queue_path): - with _sqlite3.connect(_queue_path, timeout=5) as _qconn: - pending_count = _qconn.execute("SELECT COUNT(*) FROM task").fetchone()[0] - queued_runs = Run.query.filter_by(status='queued').count() - _log(f"System: Queue has {pending_count} pending task(s), DB has {queued_runs} queued run(s).") - if queued_runs > 0 and pending_count == 0: - _log("System: WARNING — queued runs exist but queue is empty (tasks lost). Resetting to 'failed'.") - Run.query.filter_by(status='queued').update({'status': 'failed', 'end_time': datetime.utcnow()}) - db.session.commit() + else: + _log("System: No non-terminal runs found. Clean startup.") except Exception as e: _log(f"System: Startup cleanup error: {e}") diff --git a/web/routes/project.py b/web/routes/project.py index c8a073d..e4c6738 100644 --- a/web/routes/project.py +++ b/web/routes/project.py @@ -337,6 +337,7 @@ def view_project(id): runs = Run.query.filter_by(project_id=id).order_by(Run.id.desc()).all() latest_run = runs[0] if runs else None + active_runs = [r for r in runs if r.status in ('running', 'queued')] other_projects = Project.query.filter(Project.user_id == current_user.id, Project.id != id).all() @@ -385,7 +386,7 @@ def view_project(id): 'type': f.split('.')[-1].upper() }) - return render_template('project.html', project=proj, bible=bible_data, runs=runs, active_run=latest_run, artifacts=artifacts, cover_image=cover_image, personas=personas, generated_books=generated_books, other_projects=other_projects, locked=locked, has_draft=has_draft, is_refining=is_refining) + return render_template('project.html', project=proj, bible=bible_data, runs=runs, active_run=latest_run, active_runs=active_runs, artifacts=artifacts, cover_image=cover_image, personas=personas, generated_books=generated_books, other_projects=other_projects, locked=locked, has_draft=has_draft, is_refining=is_refining) @project_bp.route('/project//run', methods=['POST'])