Auto-commit: v2.14 — Stuck job robustness (heartbeat, retry, stale watcher, granular logging)

- web/db.py: Add last_heartbeat column to Run model (sketched below)
- core/utils.py: Add set_heartbeat_callback() and send_heartbeat() (sketched below)
- web/tasks.py: Add _robust_update_run_status() with 5-retry exponential backoff
  (sketched below); add db_heartbeat_callback(); remove all bare except: pass on
  DB status updates; set start_time + last_heartbeat when marking run as 'running'
- web/app.py: Add last_heartbeat column migration; add _stale_job_watcher()
  background thread (checks every 5 min, 15-min heartbeat threshold, 2-hr
  start_time threshold); the full diff appears below
- cli/engine.py: Add phase-level logging banners and try/except wrappers in
  process_book(); add utils.send_heartbeat() after each chapter save (sketched
  below); add start/finish logging in run_generation()
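
A minimal sketch of the new column on the Run model in web/db.py: the column name
and DATETIME type match the migration in the diff below, while the surrounding
fields are assumptions about the existing model, not the committed code:

    class Run(db.Model):
        id = db.Column(db.Integer, primary_key=True)   # assumed existing field
        status = db.Column(db.String(32))              # assumed existing field
        start_time = db.Column(db.DateTime)            # assumed existing field
        end_time = db.Column(db.DateTime)              # assumed existing field
        last_heartbeat = db.Column(db.DateTime)        # new: stamped on every heartbeat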
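The core/utils.py hooks decouple the engine from the web layer: the engine only
calls send_heartbeat(), and whoever launches the run decides what a heartbeat does.
A sketch assuming a module-level callback slot (the function names are from the
commit message; the bodies are guesses):

    _heartbeat_callback = None

    def set_heartbeat_callback(cb):
        """Register a callable invoked on every heartbeat (e.g. a DB timestamp update)."""
        global _heartbeat_callback
        _heartbeat_callback = cb

    def send_heartbeat():
        """Fire the registered callback; heartbeats are best-effort."""
        if _heartbeat_callback is None:
            return
        try:
            _heartbeat_callback()
        except Exception:
            pass  # a failed heartbeat must never abort generation

web/tasks.py can then call set_heartbeat_callback(db_heartbeat_callback) at run
start, keeping cli/engine.py free of any web or database imports.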
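A sketch of the retry helper in web/tasks.py, assuming SQLAlchemy session semantics
and doubling delays of 1s/2s/4s/8s/16s (the 5-retry count is from the commit
message; the signature, delays, and run-id plumbing are assumptions):

    import time
    from datetime import datetime

    def _robust_update_run_status(run_id, retries=5, **fields):
        """Apply field updates to a Run, retrying transient DB errors with
        exponential backoff instead of silently swallowing them."""
        for attempt in range(retries):
            try:
                run = Run.query.get(run_id)
                for name, value in fields.items():
                    setattr(run, name, value)
                db.session.commit()
                return True
            except Exception as e:
                db.session.rollback()
                _log(f"System: Run #{run_id} update failed (attempt {attempt + 1}/{retries}): {e}")
                time.sleep(2 ** attempt)
        return False

    def db_heartbeat_callback():
        """Stamp last_heartbeat so the stale-job watcher sees a live run."""
        # _current_run_id is a hypothetical stand-in for however the task
        # tracks which Run it is executing.
        _robust_update_run_status(_current_run_id, last_heartbeat=datetime.utcnow())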
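Finally, a sketch of the cli/engine.py changes, with hypothetical helper and phase
names; only process_book() and utils.send_heartbeat() are named in the commit:

    from core import utils

    def process_book(book):
        log("===== Phase: chapter generation =====")   # phase-level banner
        for chapter in book.chapters:                   # hypothetical iteration
            try:
                text = generate_chapter(chapter)        # hypothetical helper
                save_chapter(chapter, text)             # hypothetical helper
            except Exception as e:
                log(f"!!!!! Chapter {chapter.number} failed: {e}")
                raise
            # Prove liveness after each durable unit of work so a healthy run
            # never goes 15 minutes without a heartbeat.
            utils.send_heartbeat()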

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-21 19:00:29 -05:00
parent 97efd51fd5
commit 81340a18ea
6 changed files with 275 additions and 122 deletions

web/app.py

@@ -108,6 +108,14 @@ with app.app_context():
        _log("System: Added 'progress' column to Run table.")
    except: pass
    # Migration: Add 'last_heartbeat' column if missing
    try:
        with db.engine.connect() as conn:
            conn.execute(text("ALTER TABLE run ADD COLUMN last_heartbeat DATETIME"))
            conn.commit()
        _log("System: Added 'last_heartbeat' column to Run table.")
    except: pass
    # Reset stuck runs on startup
    try:
        stuck_runs = Run.query.filter_by(status='running').all()
@@ -133,6 +141,42 @@ with app.app_context():
        _log(f"System: Startup cleanup error: {e}")

# --- STALE JOB WATCHER ---
# Background thread that periodically detects jobs where the heartbeat has
# gone silent (>15 min) or the total run has exceeded 2 hours.
def _stale_job_watcher():
    import time as _time
    from datetime import datetime as _dt, timedelta as _td
    _HEARTBEAT_THRESHOLD = _td(minutes=15)
    _MAX_RUN_THRESHOLD = _td(hours=2)
    _CHECK_INTERVAL = 5 * 60  # seconds
    while True:
        _time.sleep(_CHECK_INTERVAL)
        try:
            with app.app_context():
                now = _dt.utcnow()
                stale = Run.query.filter_by(status='running').all()
                for r in stale:
                    # Check heartbeat first (shorter threshold)
                    if r.last_heartbeat and (now - r.last_heartbeat) > _HEARTBEAT_THRESHOLD:
                        _log(f"System: [StaleWatcher] Run #{r.id} heartbeat is {now - r.last_heartbeat} old — marking failed.")
                        r.status = 'failed'
                        r.end_time = now
                        db.session.add(r)
                    # Fallback: check start_time if no heartbeat recorded
                    elif not r.last_heartbeat and r.start_time and (now - r.start_time) > _MAX_RUN_THRESHOLD:
                        _log(f"System: [StaleWatcher] Run #{r.id} running {now - r.start_time} with no heartbeat — marking failed.")
                        r.status = 'failed'
                        r.end_time = now
                        db.session.add(r)
                db.session.commit()
        except Exception as _e:
            _log(f"System: [StaleWatcher] Error during stale-job check: {_e}")

# --- HUEY CONSUMER ---
# Start the Huey task consumer in a background thread whenever the app loads.
# Guard against the Werkzeug reloader spawning a second consumer in the child process,
@@ -173,6 +217,9 @@ if not _is_reloader_child and not _is_testing:
    _log("System: Launching Huey consumer thread...")
    _huey_thread = _threading.Thread(target=_start_huey_consumer, daemon=True, name="huey-consumer")
    _huey_thread.start()
    _log("System: Launching stale-job watcher thread (checks every 5 min)...")
    _watcher_thread = _threading.Thread(target=_stale_job_watcher, daemon=True, name="stale-job-watcher")
    _watcher_thread.start()
else:
    _log(f"System: Skipping Huey consumer (WERKZEUG_RUN_MAIN={os.environ.get('WERKZEUG_RUN_MAIN')}, FLASK_TESTING={os.environ.get('FLASK_TESTING')}).")