Auto-commit: v2.14 — Stuck job robustness (heartbeat, retry, stale watcher, granular logging)

- web/db.py: Add last_heartbeat column to Run model
- core/utils.py: Add set_heartbeat_callback() and send_heartbeat()
- web/tasks.py: Add _robust_update_run_status() with 5-retry exponential backoff;
  add db_heartbeat_callback(); remove all bare except:pass on DB status updates;
  set start_time + last_heartbeat when marking run as 'running'
- web/app.py: Add last_heartbeat column migration; add _stale_job_watcher()
  background thread (checks every 5 min, 15-min heartbeat threshold, 2-hr start_time threshold)
- cli/engine.py: Add phase-level logging banners and try/except wrappers in
  process_book(); add utils.send_heartbeat() after each chapter save;
  add start/finish logging in run_generation()

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-21 19:00:29 -05:00
parent 97efd51fd5
commit 81340a18ea
6 changed files with 275 additions and 122 deletions

View File

@@ -16,6 +16,48 @@ from export import exporter
# Configure Huey (Task Queue)
huey = SqliteHuey('bookapp_queue', filename=os.path.join(config.DATA_DIR, 'queue.db'))
def _robust_update_run_status(db_path, run_id, status, retries=5, **extra_cols):
"""Update run status with exponential-backoff retry. Raises RuntimeError if all retries fail."""
import sys as _sys
cols = {"status": status}
cols.update(extra_cols)
set_clause = ", ".join(f"{k} = ?" for k in cols)
values = list(cols.values()) + [run_id]
for attempt in range(retries):
try:
with sqlite3.connect(db_path, timeout=30, check_same_thread=False) as conn:
conn.execute(f"UPDATE run SET {set_clause} WHERE id = ?", values)
return
except sqlite3.OperationalError as e:
wait = attempt + 1
print(f"[DB WARN run={run_id}] Status update locked (attempt {attempt+1}/{retries}), retry in {wait}s: {e}", flush=True, file=_sys.stdout)
time.sleep(wait)
except Exception as e:
print(f"[DB ERROR run={run_id}] Unexpected error on status update: {type(e).__name__}: {e}", flush=True, file=_sys.stdout)
raise
msg = f"[DB CRITICAL run={run_id}] Failed to update status='{status}' after {retries} attempts."
print(msg, flush=True, file=_sys.stdout)
raise RuntimeError(msg)
def db_heartbeat_callback(db_path, run_id):
"""Updates last_heartbeat timestamp for the run in SQLite."""
import sys as _sys
for _ in range(3):
try:
with sqlite3.connect(db_path, timeout=10, check_same_thread=False) as conn:
conn.execute("UPDATE run SET last_heartbeat = ? WHERE id = ?",
(datetime.utcnow().isoformat(), run_id))
return
except sqlite3.OperationalError:
time.sleep(0.2)
except Exception as _e:
print(f"[db_heartbeat ERROR run={run_id}] {type(_e).__name__}: {_e}", flush=True, file=_sys.stdout)
return
def db_log_callback(db_path, run_id, phase, msg):
"""Writes log entry directly to SQLite to avoid Flask Context issues in threads."""
import sys as _sys
@@ -86,17 +128,17 @@ def generate_book_task(run_id, project_path, bible_path, allow_copy=True, feedba
db_path = os.path.join(config.DATA_DIR, "bookapp.db")
utils.set_log_callback(lambda p, m: db_log_callback(db_path, run_id, p, m))
utils.set_progress_callback(lambda p: db_progress_callback(db_path, run_id, p))
utils.set_heartbeat_callback(lambda: db_heartbeat_callback(db_path, run_id))
# Set Status to Running
# Set Status to Running (with start_time and initial heartbeat)
try:
with sqlite3.connect(db_path, timeout=30, check_same_thread=False) as conn:
conn.execute("UPDATE run SET status = 'running' WHERE id = ?", (run_id,))
_robust_update_run_status(db_path, run_id, 'running',
start_time=datetime.utcnow().isoformat(),
last_heartbeat=datetime.utcnow().isoformat())
_task_log("Run status set to 'running' in DB.")
except sqlite3.OperationalError as e:
_task_log(f"WARNING: DB locked when setting run status: {e}")
utils.log("SYSTEM", f"Database locked when setting run status (run {run_id}): {e}")
except Exception as e:
_task_log(f"WARNING: Could not set run status: {e}")
_task_log(f"WARNING: Could not set run status to 'running': {e}")
utils.log("SYSTEM", f"WARNING: run status update failed (run {run_id}): {e}")
utils.log("SYSTEM", f"Starting Job #{run_id}")
@@ -227,11 +269,13 @@ def generate_book_task(run_id, project_path, bible_path, allow_copy=True, feedba
# 4. Update Database with Final Status — run is never left in 'running' state
try:
with sqlite3.connect(db_path, timeout=30, check_same_thread=False) as conn:
conn.execute("UPDATE run SET status = ?, cost = ?, end_time = ?, log_file = ?, progress = 100 WHERE id = ?",
(status, total_cost, datetime.utcnow(), final_log_path, run_id))
_robust_update_run_status(db_path, run_id, status,
cost=total_cost,
end_time=datetime.utcnow().isoformat(),
log_file=final_log_path,
progress=100)
except Exception as e:
print(f"Failed to update run status in DB: {e}")
print(f"[CRITICAL run={run_id}] Final status update failed after all retries: {e}", flush=True)
_task_log(f"Task finished. status={status} cost=${total_cost:.4f}")
return {"run_id": run_id, "status": status, "cost": total_cost, "final_log": final_log_path}
@@ -255,10 +299,16 @@ def regenerate_artifacts_task(run_id, project_path, feedback=None):
utils.set_log_file(log_file)
utils.set_log_callback(lambda p, m: db_log_callback(db_path, run_id, p, m))
try:
with sqlite3.connect(db_path) as conn:
with sqlite3.connect(db_path, timeout=30, check_same_thread=False) as conn:
conn.execute("DELETE FROM log_entry WHERE run_id = ?", (run_id,))
conn.execute("UPDATE run SET status = 'running' WHERE id = ?", (run_id,))
except: pass
except Exception as _e:
print(f"[WARN run={run_id}] Could not clear log_entry for regen: {_e}", flush=True)
try:
_robust_update_run_status(db_path, run_id, 'running',
start_time=datetime.utcnow().isoformat(),
last_heartbeat=datetime.utcnow().isoformat())
except Exception as _e:
print(f"[WARN run={run_id}] Could not set status to 'running' for regen: {_e}", flush=True)
utils.log("SYSTEM", "Starting Artifact Regeneration...")
@@ -272,9 +322,9 @@ def regenerate_artifacts_task(run_id, project_path, feedback=None):
if not os.path.exists(run_dir) or not os.path.exists(bible_path):
utils.log("ERROR", "Run directory or Bible not found.")
try:
with sqlite3.connect(db_path) as conn:
conn.execute("UPDATE run SET status = 'failed' WHERE id = ?", (run_id,))
except: pass
_robust_update_run_status(db_path, run_id, 'failed')
except Exception as _e:
print(f"[WARN run={run_id}] Could not set status to 'failed': {_e}", flush=True)
return
bible = utils.load_json(bible_path)
@@ -284,9 +334,9 @@ def regenerate_artifacts_task(run_id, project_path, feedback=None):
if not os.path.exists(final_bp_path) or not os.path.exists(ms_path):
utils.log("ERROR", f"Blueprint or Manuscript not found in {book_dir}")
try:
with sqlite3.connect(db_path) as conn:
conn.execute("UPDATE run SET status = 'failed' WHERE id = ?", (run_id,))
except: pass
_robust_update_run_status(db_path, run_id, 'failed')
except Exception as _e:
print(f"[WARN run={run_id}] Could not set status to 'failed': {_e}", flush=True)
return
bp = utils.load_json(final_bp_path)
@@ -328,9 +378,9 @@ def regenerate_artifacts_task(run_id, project_path, feedback=None):
final_status = 'failed'
try:
with sqlite3.connect(db_path) as conn:
conn.execute("UPDATE run SET status = ? WHERE id = ?", (final_status, run_id))
except: pass
_robust_update_run_status(db_path, run_id, final_status)
except Exception as _e:
print(f"[CRITICAL run={run_id}] Final regen status update failed: {_e}", flush=True)
@huey.task()
@@ -338,6 +388,8 @@ def rewrite_chapter_task(run_id, project_path, book_folder, chap_num, instructio
"""
Background task to rewrite a single chapter and propagate changes.
"""
db_path = os.path.join(config.DATA_DIR, "bookapp.db")
try:
run_dir = os.path.join(project_path, "runs", f"run_{run_id}")
@@ -350,14 +402,19 @@ def rewrite_chapter_task(run_id, project_path, book_folder, chap_num, instructio
except: pass
utils.set_log_file(log_file)
db_path = os.path.join(config.DATA_DIR, "bookapp.db")
utils.set_log_callback(lambda p, m: db_log_callback(db_path, run_id, p, m))
try:
with sqlite3.connect(db_path) as conn:
with sqlite3.connect(db_path, timeout=30, check_same_thread=False) as conn:
conn.execute("DELETE FROM log_entry WHERE run_id = ?", (run_id,))
conn.execute("UPDATE run SET status = 'running' WHERE id = ?", (run_id,))
except: pass
except Exception as _e:
print(f"[WARN run={run_id}] Could not clear log_entry for rewrite: {_e}", flush=True)
try:
_robust_update_run_status(db_path, run_id, 'running',
start_time=datetime.utcnow().isoformat(),
last_heartbeat=datetime.utcnow().isoformat())
except Exception as _e:
print(f"[WARN run={run_id}] Could not set status to 'running' for rewrite: {_e}", flush=True)
book_path = os.path.join(run_dir, book_folder)
ms_path = os.path.join(book_path, "manuscript.json")
@@ -392,22 +449,25 @@ def rewrite_chapter_task(run_id, project_path, book_folder, chap_num, instructio
exporter.compile_files(bp, ms, book_path)
try:
with sqlite3.connect(db_path) as conn:
conn.execute("UPDATE run SET status = 'completed' WHERE id = ?", (run_id,))
except: pass
_robust_update_run_status(db_path, run_id, 'completed',
end_time=datetime.utcnow().isoformat())
except Exception as _e:
print(f"[WARN run={run_id}] Could not set status to 'completed': {_e}", flush=True)
return True
try:
with sqlite3.connect(db_path) as conn:
conn.execute("UPDATE run SET status = 'completed' WHERE id = ?", (run_id,))
except: pass
_robust_update_run_status(db_path, run_id, 'completed',
end_time=datetime.utcnow().isoformat())
except Exception as _e:
print(f"[WARN run={run_id}] Could not set status to 'completed': {_e}", flush=True)
return False
except Exception as e:
utils.log("ERROR", f"Rewrite task exception for run {run_id}/{book_folder}: {e}")
try:
with sqlite3.connect(db_path) as conn:
conn.execute("UPDATE run SET status = 'failed' WHERE id = ?", (run_id,))
except: pass
_robust_update_run_status(db_path, run_id, 'failed',
end_time=datetime.utcnow().isoformat())
except Exception as _e:
print(f"[CRITICAL run={run_id}] Could not set status to 'failed' after rewrite error: {_e}", flush=True)
return False