Auto-commit: v2.14 — Stuck job robustness (heartbeat, retry, stale watcher, granular logging)
- web/db.py: Add last_heartbeat column to Run model - core/utils.py: Add set_heartbeat_callback() and send_heartbeat() - web/tasks.py: Add _robust_update_run_status() with 5-retry exponential backoff; add db_heartbeat_callback(); remove all bare except:pass on DB status updates; set start_time + last_heartbeat when marking run as 'running' - web/app.py: Add last_heartbeat column migration; add _stale_job_watcher() background thread (checks every 5 min, 15-min heartbeat threshold, 2-hr start_time threshold) - cli/engine.py: Add phase-level logging banners and try/except wrappers in process_book(); add utils.send_heartbeat() after each chapter save; add start/finish logging in run_generation() Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
47
web/app.py
47
web/app.py
@@ -108,6 +108,14 @@ with app.app_context():
|
||||
_log("System: Added 'progress' column to Run table.")
|
||||
except: pass
|
||||
|
||||
# Migration: Add 'last_heartbeat' column if missing
|
||||
try:
|
||||
with db.engine.connect() as conn:
|
||||
conn.execute(text("ALTER TABLE run ADD COLUMN last_heartbeat DATETIME"))
|
||||
conn.commit()
|
||||
_log("System: Added 'last_heartbeat' column to Run table.")
|
||||
except: pass
|
||||
|
||||
# Reset stuck runs on startup
|
||||
try:
|
||||
stuck_runs = Run.query.filter_by(status='running').all()
|
||||
@@ -133,6 +141,42 @@ with app.app_context():
|
||||
_log(f"System: Startup cleanup error: {e}")
|
||||
|
||||
|
||||
# --- STALE JOB WATCHER ---
|
||||
# Background thread that periodically detects jobs where the heartbeat has
|
||||
# gone silent (>15 min) or the total run has exceeded 2 hours.
|
||||
|
||||
def _stale_job_watcher():
    """Background daemon loop: detect and fail stale 'running' Runs.

    Two staleness rules, evaluated per run in priority order:
      * a heartbeat was recorded but is older than 15 minutes, or
      * no heartbeat was ever recorded and start_time is more than
        2 hours ago (fallback for runs predating heartbeat support).
    Any error during a sweep is logged and the loop continues; this
    function never returns and is meant to run in a daemon thread.
    """
    import time as _time
    from datetime import datetime as _dt, timedelta as _td

    heartbeat_limit = _td(minutes=15)
    max_runtime = _td(hours=2)
    poll_seconds = 5 * 60  # sweep every 5 minutes

    def _mark_failed(run, message, now):
        # Shared bookkeeping for both staleness rules.
        _log(message)
        run.status = 'failed'
        run.end_time = now
        db.session.add(run)

    while True:
        _time.sleep(poll_seconds)
        try:
            with app.app_context():
                now = _dt.utcnow()
                for run in Run.query.filter_by(status='running').all():
                    if run.last_heartbeat:
                        # Heartbeat exists: the shorter silence threshold governs.
                        if (now - run.last_heartbeat) > heartbeat_limit:
                            _mark_failed(
                                run,
                                f"System: [StaleWatcher] Run #{run.id} heartbeat is {now - run.last_heartbeat} old — marking failed.",
                                now,
                            )
                    elif run.start_time and (now - run.start_time) > max_runtime:
                        # No heartbeat ever recorded: fall back to total-runtime cap.
                        _mark_failed(
                            run,
                            f"System: [StaleWatcher] Run #{run.id} running {now - run.start_time} with no heartbeat — marking failed.",
                            now,
                        )
                db.session.commit()
        except Exception as exc:
            _log(f"System: [StaleWatcher] Error during stale-job check: {exc}")
|
||||
|
||||
|
||||
# --- HUEY CONSUMER ---
|
||||
# Start the Huey task consumer in a background thread whenever the app loads.
|
||||
# Guard against the Werkzeug reloader spawning a second consumer in the child process,
|
||||
@@ -173,6 +217,9 @@ if not _is_reloader_child and not _is_testing:
|
||||
_log("System: Launching Huey consumer thread...")
|
||||
_huey_thread = _threading.Thread(target=_start_huey_consumer, daemon=True, name="huey-consumer")
|
||||
_huey_thread.start()
|
||||
_log("System: Launching stale-job watcher thread (checks every 5 min)...")
|
||||
_watcher_thread = _threading.Thread(target=_stale_job_watcher, daemon=True, name="stale-job-watcher")
|
||||
_watcher_thread.start()
|
||||
else:
|
||||
_log(f"System: Skipping Huey consumer (WERKZEUG_RUN_MAIN={os.environ.get('WERKZEUG_RUN_MAIN')}, FLASK_TESTING={os.environ.get('FLASK_TESTING')}).")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user