Files
bookapp/web/app.py
Mike Wichers 81340a18ea Auto-commit: v2.14 — Stuck job robustness (heartbeat, retry, stale watcher, granular logging)
- web/db.py: Add last_heartbeat column to Run model
- core/utils.py: Add set_heartbeat_callback() and send_heartbeat()
- web/tasks.py: Add _robust_update_run_status() with 5-retry exponential backoff;
  add db_heartbeat_callback(); remove all bare except:pass on DB status updates;
  set start_time + last_heartbeat when marking run as 'running'
- web/app.py: Add last_heartbeat column migration; add _stale_job_watcher()
  background thread (checks every 5 min, 15-min heartbeat threshold, 2-hr start_time threshold)
- cli/engine.py: Add phase-level logging banners and try/except wrappers in
  process_book(); add utils.send_heartbeat() after each chapter save;
  add start/finish logging in run_generation()

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-21 19:00:29 -05:00

229 lines
9.0 KiB
Python

import os
import sys
import platform
from datetime import datetime
from sqlalchemy import text
from flask import Flask
from flask_login import LoginManager
from werkzeug.security import generate_password_hash
from web.db import db, User, Run
from web.tasks import huey
from core import config
def _force_utf8_streams(*streams):
    """Best-effort switch of text streams to UTF-8 with ``errors='replace'``.

    Every stream is handled on its own, so a stream that lacks
    ``reconfigure`` (or whose ``reconfigure`` raises) never prevents the
    remaining streams from being reconfigured.

    Args:
        *streams: Stream objects to reconfigure. ``None`` and objects
            without a ``reconfigure`` attribute are skipped.
    """
    for stream in streams:
        reconfigure = getattr(stream, 'reconfigure', None)
        if reconfigure is None:
            continue
        try:
            reconfigure(encoding='utf-8', errors='replace')
        except Exception:
            # Deliberately swallowed: output encoding is a nicety and must
            # never stop the application from starting.
            continue


# Ensure stdout/stderr are UTF-8 in all environments (Docker, Windows, Raspberry Pi)
_force_utf8_streams(sys.stdout, sys.stderr)
def _log(msg):
    """Write *msg* to stdout as one line and flush at once.

    The explicit flush keeps the line from sitting in a buffer, so log
    collectors such as Docker see it immediately.
    """
    print(msg, end="\n", flush=True)
# Locate the project root from this module's own path: take the directory
# that contains this file, then that directory's parent.
BASE_DIR = os.path.dirname(
    os.path.dirname(os.path.abspath(__file__))
)
TEMPLATE_DIR = os.path.join(BASE_DIR, 'templates')

# Flask application; URLs match with or without a trailing slash.
app = Flask(__name__, template_folder=TEMPLATE_DIR)
app.url_map.strict_slashes = False
app.config.update(
    SECRET_KEY=config.FLASK_SECRET,
    # The application database is a SQLite file inside the configured data directory.
    SQLALCHEMY_DATABASE_URI=f'sqlite:///{os.path.join(config.DATA_DIR, "bookapp.db")}',
)
db.init_app(app)

# Login handling; 'auth.login' is the endpoint used as the login view.
login_manager = LoginManager()
login_manager.login_view = 'auth.login'
login_manager.init_app(app)
@login_manager.user_loader
def load_user(user_id):
    """Return the ``User`` for a session-stored id, or ``None`` if unusable.

    Args:
        user_id: The identifier handed over by Flask-Login.

    Returns:
        The matching ``User`` row, or ``None`` when the id is not an integer
        or no such user exists.
    """
    try:
        primary_key = int(user_id)
    except (TypeError, ValueError):
        # A malformed or missing id must be reported as "no such user" rather
        # than raised, so a bad session value cannot crash the request.
        return None
    return db.session.get(User, primary_key)
@app.context_processor
def inject_globals():
    """Supply ``app_version`` (from ``config.VERSION``) to the template context."""
    return {'app_version': config.VERSION}
# Register Blueprints
# NOTE(review): these imports sit below the creation of `app` rather than at
# the top of the file — presumably the route modules need it to exist first;
# confirm before moving them.
from web.routes.auth import auth_bp
from web.routes.project import project_bp
from web.routes.run import run_bp
from web.routes.persona import persona_bp
from web.routes.admin import admin_bp

# Registration order is kept: auth, project, run, persona, admin.
for _blueprint in (auth_bp, project_bp, run_bp, persona_bp, admin_bp):
    app.register_blueprint(_blueprint)
# --- STARTUP DIAGNOSTIC BANNER ---
# Prints version, interpreter, platform and data-file locations between two
# 60-character rules so the environment is visible at the top of the logs.
_banner_rule = "=" * 60
for _banner_line in (
    _banner_rule,
    f"BookApp v{config.VERSION} starting up",
    f" Python : {sys.version}",
    f" Platform : {platform.platform()}",
    f" Data dir : {config.DATA_DIR}",
    f" Queue db : {os.path.join(config.DATA_DIR, 'queue.db')}",
    f" App db : {os.path.join(config.DATA_DIR, 'bookapp.db')}",
):
    _log(_banner_line)

# The Huey version is informational only; any failure to obtain it falls back
# to a placeholder line instead of interrupting startup.
try:
    import huey as _huey_pkg
    _log(f" Huey : {_huey_pkg.__version__}")
except Exception:
    _log(" Huey : (version unknown)")

_log(_banner_rule)
# --- SETUP ---
def _ensure_run_column(column, ddl):
    """Execute ``ddl`` to add ``column`` to the ``run`` table if it is absent.

    Must be called inside an application context because it uses
    ``db.engine``.

    Args:
        column: Name of the column this migration is responsible for.
        ddl: The ``ALTER TABLE run ADD COLUMN ...`` statement to execute when
            the column does not exist yet.

    The migration is best-effort: a failure is logged and swallowed so it
    cannot prevent the application from starting, but it is no longer hidden.
    """
    from sqlalchemy import inspect as _sa_inspect  # only needed by this helper

    try:
        present = {col['name'] for col in _sa_inspect(db.engine).get_columns('run')}
        if column in present:
            return
        with db.engine.connect() as conn:
            conn.execute(text(ddl))
            conn.commit()
        _log(f"System: Added '{column}' column to Run table.")
    except Exception as exc:
        _log(f"System: Migration for '{column}' column failed: {type(exc).__name__}: {exc}")


with app.app_context():
    db.create_all()

    # Auto-create Admin from Environment Variables (Docker/Portainer Setup)
    if config.ADMIN_USER and config.ADMIN_PASSWORD:
        admin = User.query.filter_by(username=config.ADMIN_USER).first()
        if not admin:
            _log(f"System: Creating Admin User '{config.ADMIN_USER}' from environment variables.")
            admin = User(username=config.ADMIN_USER, password=generate_password_hash(config.ADMIN_PASSWORD, method='pbkdf2:sha256'), is_admin=True)
            db.session.add(admin)
            db.session.commit()
        else:
            # The environment is the source of truth: the admin flag and the
            # password hash are re-applied on every startup.
            _log(f"System: Syncing Admin User '{config.ADMIN_USER}' settings from environment.")
            if not admin.is_admin:
                admin.is_admin = True
            admin.password = generate_password_hash(config.ADMIN_PASSWORD, method='pbkdf2:sha256')
            db.session.add(admin)
            db.session.commit()
    elif not User.query.filter_by(is_admin=True).first():
        _log("System: No Admin credentials found in environment variables. Admin account not created.")

    # Migration: Add 'progress' column if missing
    _ensure_run_column('progress', "ALTER TABLE run ADD COLUMN progress INTEGER DEFAULT 0")

    # Migration: Add 'last_heartbeat' column if missing
    _ensure_run_column('last_heartbeat', "ALTER TABLE run ADD COLUMN last_heartbeat DATETIME")

    # Reset stuck runs on startup: nothing can still be executing a run that
    # was 'running' before this process started.
    try:
        stuck_runs = Run.query.filter_by(status='running').all()
        if stuck_runs:
            _log(f"System: Found {len(stuck_runs)} stuck run(s) — resetting to 'failed'.")
            for r in stuck_runs:
                r.status = 'failed'
                r.end_time = datetime.utcnow()
            db.session.commit()

        # Also reset stuck 'queued' runs whose task entry was lost from queue.db
        import sqlite3 as _sqlite3
        from contextlib import closing as _closing

        _queue_path = os.path.join(config.DATA_DIR, 'queue.db')
        if os.path.exists(_queue_path):
            # sqlite3's own context manager only commits/rolls back; closing()
            # makes sure the connection to the queue file is released.
            with _closing(_sqlite3.connect(_queue_path, timeout=5)) as _qconn:
                pending_count = _qconn.execute("SELECT COUNT(*) FROM task").fetchone()[0]
            queued_runs = Run.query.filter_by(status='queued').count()
            _log(f"System: Queue has {pending_count} pending task(s), DB has {queued_runs} queued run(s).")
            if queued_runs > 0 and pending_count == 0:
                _log("System: WARNING — queued runs exist but queue is empty (tasks lost). Resetting to 'failed'.")
                Run.query.filter_by(status='queued').update({'status': 'failed', 'end_time': datetime.utcnow()})
                db.session.commit()
    except Exception as e:
        # Cleanup is best-effort; report the problem and continue starting up.
        _log(f"System: Startup cleanup error: {e}")
# --- STALE JOB WATCHER ---
# Background thread body. Every 5 minutes it fails any 'running' run whose
# last heartbeat is more than 15 minutes old, or — only when the run has no
# heartbeat recorded at all — whose start_time is more than 2 hours ago.
def _stale_job_watcher():
    """Loop forever, marking silent 'running' runs as 'failed'.

    Each cycle sleeps first, then inspects all runs with status 'running'
    inside an application context. Errors raised during a cycle are logged
    and the loop carries on with the next cycle. The function never returns.
    """
    import time as _time
    from datetime import datetime as _dt, timedelta as _td

    heartbeat_limit = _td(minutes=15)
    no_heartbeat_limit = _td(hours=2)
    poll_seconds = 5 * 60

    while True:
        _time.sleep(poll_seconds)
        try:
            with app.app_context():
                now = _dt.utcnow()
                for run in Run.query.filter_by(status='running').all():
                    if run.last_heartbeat:
                        # A heartbeat exists: only the shorter threshold applies.
                        if (now - run.last_heartbeat) <= heartbeat_limit:
                            continue
                        _log(f"System: [StaleWatcher] Run #{run.id} heartbeat is {now - run.last_heartbeat} old — marking failed.")
                    elif run.start_time and (now - run.start_time) > no_heartbeat_limit:
                        # No heartbeat was ever recorded: fall back to start_time.
                        _log(f"System: [StaleWatcher] Run #{run.id} running {now - run.start_time} with no heartbeat — marking failed.")
                    else:
                        continue
                    run.status = 'failed'
                    run.end_time = now
                    db.session.add(run)
                # One commit per cycle, after every run has been examined.
                db.session.commit()
        except Exception as _e:
            _log(f"System: [StaleWatcher] Error during stale-job check: {_e}")
# --- HUEY CONSUMER ---
# Start the Huey task consumer in a background thread whenever the app loads.
# Guard against the Werkzeug reloader spawning a second consumer in the child process,
# and against test runners or importers that should not start background workers.
import threading as _threading
def _start_huey_consumer():
    """Thread target that configures logging and runs the Huey consumer.

    Any exception raised while importing, constructing or running the
    consumer is logged to stdout and appended to ``consumer_error.log`` in
    the data directory; it is never re-raised.
    """
    import logging as _logging

    # INFO level so task pick-up/completion shows in the container logs;
    # force=True replaces any logging configuration already installed.
    _logging.basicConfig(
        stream=sys.stdout,
        level=_logging.INFO,
        datefmt='%H:%M:%S',
        format='[%(asctime)s] HUEY %(levelname)s | %(message)s',
        force=True,
    )

    def _persist_failure(line):
        # Keep a copy on disk for diagnosis when stdout is piped away.
        # Writing it is itself best-effort.
        try:
            error_log = os.path.join(config.DATA_DIR, "consumer_error.log")
            with open(error_log, 'a', encoding='utf-8') as handle:
                handle.write(f"[{datetime.utcnow().isoformat()}] {line}\n")
        except Exception:
            pass

    try:
        from huey.consumer import Consumer

        # NOTE: Huey 2.6.0 does NOT accept a `loglevel` kwarg — omit it.
        consumer = Consumer(huey, workers=1, worker_type='thread')
        # NOTE(review): this message is emitted before run() is entered, so it
        # can be followed by a FAILED line if run() raises.
        _log("System: Huey task consumer started successfully.")
        # NOTE(review): run() executes in a non-main thread here; if the
        # installed Huey registers signal handlers during startup this would
        # raise — confirm against the deployed version.
        consumer.run()  # blocks until app exits
    except Exception as exc:
        failure = f"System: Huey consumer FAILED to start: {type(exc).__name__}: {exc}"
        _log(failure)
        _persist_failure(failure)
# Decide whether this process should own the background workers: not when it
# is the Werkzeug reloader's child, and not when FLASK_TESTING is set to '1'.
_is_reloader_child = os.environ.get('WERKZEUG_RUN_MAIN') == 'true'
_is_testing = os.environ.get('FLASK_TESTING') == '1'

if _is_reloader_child or _is_testing:
    # Neither the consumer nor the stale-job watcher is started in this case.
    _log(f"System: Skipping Huey consumer (WERKZEUG_RUN_MAIN={os.environ.get('WERKZEUG_RUN_MAIN')}, FLASK_TESTING={os.environ.get('FLASK_TESTING')}).")
else:
    # Both threads are daemons, so they do not keep the process alive on exit.
    _log("System: Launching Huey consumer thread...")
    _huey_thread = _threading.Thread(
        name="huey-consumer",
        target=_start_huey_consumer,
        daemon=True,
    )
    _huey_thread.start()

    _log("System: Launching stale-job watcher thread (checks every 5 min)...")
    _watcher_thread = _threading.Thread(
        name="stale-job-watcher",
        target=_stale_job_watcher,
        daemon=True,
    )
    _watcher_thread.start()
if __name__ == "__main__":
    # Direct execution: listen on all interfaces, port 5000, debug mode off.
    app.run(
        debug=False,
        port=5000,
        host='0.0.0.0',
    )