Auto-commit: v2.14 — Stuck job robustness (heartbeat, retry, stale watcher, granular logging)

- web/db.py: Add last_heartbeat column to Run model (sketched below)
- core/utils.py: Add set_heartbeat_callback() and send_heartbeat() (sketched below)
- web/tasks.py: Add _robust_update_run_status() with 5-retry exponential backoff
  (sketched below); add db_heartbeat_callback(); remove all bare except: pass on
  DB status updates; set start_time + last_heartbeat when marking run as 'running'
- web/app.py: Add last_heartbeat column migration; add _stale_job_watcher()
  background thread (checks every 5 min, 15-min heartbeat threshold, 2-hr
  start_time threshold); the full diff appears below
- cli/engine.py: Add phase-level logging banners and try/except wrappers in
  process_book(); add utils.send_heartbeat() after each chapter save (sketched
  below); add start/finish logging in run_generation()
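
A minimal sketch of the new column on the Run model in web/db.py: the column name
and DATETIME type match the migration in the diff below, while the surrounding
fields are assumptions about the existing model, not the committed code:

    class Run(db.Model):
        id = db.Column(db.Integer, primary_key=True)   # assumed existing field
        status = db.Column(db.String(32))              # assumed existing field
        start_time = db.Column(db.DateTime)            # assumed existing field
        end_time = db.Column(db.DateTime)              # assumed existing field
        last_heartbeat = db.Column(db.DateTime)        # new: stamped on every heartbeat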
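The core/utils.py hooks decouple the engine from the web layer: the engine only
calls send_heartbeat(), and whoever launches the run decides what a heartbeat does.
A sketch assuming a module-level callback slot (the function names are from the
commit message; the bodies are guesses):

    _heartbeat_callback = None

    def set_heartbeat_callback(cb):
        """Register a callable invoked on every heartbeat (e.g. a DB timestamp update)."""
        global _heartbeat_callback
        _heartbeat_callback = cb

    def send_heartbeat():
        """Fire the registered callback; heartbeats are best-effort."""
        if _heartbeat_callback is None:
            return
        try:
            _heartbeat_callback()
        except Exception:
            pass  # a failed heartbeat must never abort generation

web/tasks.py can then call set_heartbeat_callback(db_heartbeat_callback) at run
start, keeping cli/engine.py free of any web or database imports.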
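A sketch of the retry helper in web/tasks.py, assuming SQLAlchemy session semantics
and doubling delays of 1s/2s/4s/8s/16s (the 5-retry count is from the commit
message; the signature, delays, and run-id plumbing are assumptions):

    import time
    from datetime import datetime

    def _robust_update_run_status(run_id, retries=5, **fields):
        """Apply field updates to a Run, retrying transient DB errors with
        exponential backoff instead of silently swallowing them."""
        for attempt in range(retries):
            try:
                run = Run.query.get(run_id)
                for name, value in fields.items():
                    setattr(run, name, value)
                db.session.commit()
                return True
            except Exception as e:
                db.session.rollback()
                _log(f"System: Run #{run_id} update failed (attempt {attempt + 1}/{retries}): {e}")
                time.sleep(2 ** attempt)
        return False

    def db_heartbeat_callback():
        """Stamp last_heartbeat so the stale-job watcher sees a live run."""
        # _current_run_id is a hypothetical stand-in for however the task
        # tracks which Run it is executing.
        _robust_update_run_status(_current_run_id, last_heartbeat=datetime.utcnow())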
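Finally, a sketch of the cli/engine.py changes, with hypothetical helper and phase
names; only process_book() and utils.send_heartbeat() are named in the commit:

    from core import utils

    def process_book(book):
        log("===== Phase: chapter generation =====")   # phase-level banner
        for chapter in book.chapters:                   # hypothetical iteration
            try:
                text = generate_chapter(chapter)        # hypothetical helper
                save_chapter(chapter, text)             # hypothetical helper
            except Exception as e:
                log(f"!!!!! Chapter {chapter.number} failed: {e}")
                raise
            # Prove liveness after each durable unit of work so a healthy run
            # never goes 15 minutes without a heartbeat.
            utils.send_heartbeat()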

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-21 19:00:29 -05:00
parent 97efd51fd5
commit 81340a18ea
6 changed files with 275 additions and 122 deletions

web/app.py

@@ -108,6 +108,14 @@ with app.app_context():
        _log("System: Added 'progress' column to Run table.")
    except: pass
    # Migration: Add 'last_heartbeat' column if missing
    try:
        with db.engine.connect() as conn:
            conn.execute(text("ALTER TABLE run ADD COLUMN last_heartbeat DATETIME"))
            conn.commit()
        _log("System: Added 'last_heartbeat' column to Run table.")
    except: pass
    # Reset stuck runs on startup
    try:
        stuck_runs = Run.query.filter_by(status='running').all()
@@ -133,6 +141,42 @@ with app.app_context():
        _log(f"System: Startup cleanup error: {e}")

# --- STALE JOB WATCHER ---
# Background thread that periodically detects jobs where the heartbeat has
# gone silent (>15 min) or the total run has exceeded 2 hours.
def _stale_job_watcher():
    import time as _time
    from datetime import datetime as _dt, timedelta as _td
    _HEARTBEAT_THRESHOLD = _td(minutes=15)
    _MAX_RUN_THRESHOLD = _td(hours=2)
    _CHECK_INTERVAL = 5 * 60  # seconds
    while True:
        _time.sleep(_CHECK_INTERVAL)
        try:
            with app.app_context():
                now = _dt.utcnow()
                stale = Run.query.filter_by(status='running').all()
                for r in stale:
                    # Check heartbeat first (shorter threshold)
                    if r.last_heartbeat and (now - r.last_heartbeat) > _HEARTBEAT_THRESHOLD:
                        _log(f"System: [StaleWatcher] Run #{r.id} heartbeat is {now - r.last_heartbeat} old — marking failed.")
                        r.status = 'failed'
                        r.end_time = now
                        db.session.add(r)
                    # Fallback: check start_time if no heartbeat recorded
                    elif not r.last_heartbeat and r.start_time and (now - r.start_time) > _MAX_RUN_THRESHOLD:
                        _log(f"System: [StaleWatcher] Run #{r.id} running {now - r.start_time} with no heartbeat — marking failed.")
                        r.status = 'failed'
                        r.end_time = now
                        db.session.add(r)
                db.session.commit()
        except Exception as _e:
            _log(f"System: [StaleWatcher] Error during stale-job check: {_e}")

# --- HUEY CONSUMER ---
# Start the Huey task consumer in a background thread whenever the app loads.
# Guard against the Werkzeug reloader spawning a second consumer in the child process,
@@ -173,6 +217,9 @@ if not _is_reloader_child and not _is_testing:
    _log("System: Launching Huey consumer thread...")
    _huey_thread = _threading.Thread(target=_start_huey_consumer, daemon=True, name="huey-consumer")
    _huey_thread.start()
    _log("System: Launching stale-job watcher thread (checks every 5 min)...")
    _watcher_thread = _threading.Thread(target=_stale_job_watcher, daemon=True, name="stale-job-watcher")
    _watcher_thread.start()
else:
    _log(f"System: Skipping Huey consumer (WERKZEUG_RUN_MAIN={os.environ.get('WERKZEUG_RUN_MAIN')}, FLASK_TESTING={os.environ.get('FLASK_TESTING')}).")