Auto-commit: v2.9 — Fix background task hangs (OAuth headless guard, SQLite timeouts, log touch)

- ai/setup.py: Added threading import; OAuth block now detects background/headless
  threads and skips run_local_server to prevent indefinite blocking. Logs a clear
  warning and falls back to ADC for Vertex AI. Token file only written when creds
  are not None.
- web/tasks.py: All sqlite3.connect() calls now use timeout=30, check_same_thread=False.
  OperationalError on the initial status update is caught and logged via utils.log.
  generate_book_task now touches initial_log immediately so the UI polling endpoint
  always finds an existing file even if the worker crashes on the next line.
- ai_blueprint.md: Bumped to v2.9; Section 12.D sub-items 1-3 marked *(Implemented v2.9)*;
  item 13 added to summary.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-21 10:50:00 -05:00
parent c2d6936aa5
commit 1f01fedf00
3 changed files with 47 additions and 12 deletions

View File

@@ -2,6 +2,7 @@ import os
import json
import time
import warnings
import threading
import google.generativeai as genai
from core import config, utils
from ai import models
@@ -256,19 +257,30 @@ def init_models(force=False):
if os.path.exists(token_path):
creds = models.Credentials.from_authorized_user_file(token_path, SCOPES)
_is_headless = threading.current_thread() is not threading.main_thread()
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
try:
creds.refresh(models.Request())
except Exception:
if _is_headless:
utils.log("SYSTEM", "⚠️ Token refresh failed and cannot re-authenticate in a background/headless thread. Vertex AI will use ADC or be unavailable.")
creds = None
else:
utils.log("SYSTEM", "Token refresh failed. Re-authenticating...")
flow = models.InstalledAppFlow.from_client_secrets_file(gac, SCOPES)
creds = flow.run_local_server(port=0)
else:
if _is_headless:
utils.log("SYSTEM", "⚠️ OAuth Client ID requires browser login but running in headless/background mode. Skipping interactive auth. Use a Service Account key for Vertex AI in background tasks.")
creds = None
else:
utils.log("SYSTEM", "OAuth Client ID detected. Launching browser to authenticate...")
flow = models.InstalledAppFlow.from_client_secrets_file(gac, SCOPES)
creds = flow.run_local_server(port=0)
if creds:
with open(token_path, 'w') as token: token.write(creds.to_json())
flow = models.InstalledAppFlow.from_client_secrets_file(gac, SCOPES)
creds = flow.run_local_server(port=0)
with open(token_path, 'w') as token: token.write(creds.to_json())
utils.log("SYSTEM", "✅ Authenticated via OAuth Client ID.")
except Exception as e:

View File

@@ -1,4 +1,4 @@
# AI Context Optimization Blueprint (v2.9)
This blueprint outlines architectural improvements for how AI context is managed during the writing process. The goal is to provide the AI (Claude/Gemini) with **better, highly-targeted context upfront**, which will dramatically improve first-draft quality and reduce the reliance on expensive, time-consuming quality checks and rewrites (currently up to 5 attempts).
@@ -147,6 +147,16 @@ Three bugs combined to produce a blank page or silent failure when creating a ne
5.`web/routes/project.py` (`project_setup_wizard`): When `model_logic` was `None`, the route silently redirected to the dashboard with a flash the user missed. Now renders the setup form with a complete default suggestions dict (all fields populated, lists as `[]`) and a visible `"warning"` flash so the user can fill in details manually. *(Implemented v2.8)*
6.`web/routes/project.py` (`create_project_final`): `planner.enrich()` was called with the full project bible dict. `enrich()` reads `bp.get('manual_instruction')` from the top level (got `'A generic story'` fallback — the real concept was in `bible['books'][0]['manual_instruction']`), and wrote enriched data into a new `book_metadata` key instead of the bible's `books[0]`. Fixed to build a proper per-book blueprint, call enrich, and merge `characters`, `plot_beats`, and `structure_prompt` back into the correct bible locations. *(Implemented v2.8)*
### D. "Waiting for logs" / "Preparing environment" Background Task Hangs
The UI gets stuck indefinitely because the background Huey worker thread hangs before emitting the first "Starting Job" log, or fails to connect to the database.
**Places that impact this and their fixes:**
1.**OAuth Browser Prompt in Background Thread**: `ai/setup.py` — Added `import threading`; the OAuth block now checks `threading.current_thread() is not threading.main_thread()`. If running headlessly, `run_local_server` is skipped, `creds` is set to `None`, and a clear warning is logged. Vertex AI falls back to ADC. Token is only written if `creds` is not `None`. *(Implemented v2.9)*
2.**SQLite Database Locking Timeout**: `web/tasks.py` — All `sqlite3.connect()` calls now use `timeout=30, check_same_thread=False`. The initial status-update `OperationalError` is caught and logged via `utils.log` so it appears in the log file rather than silently disappearing. *(Implemented v2.9)*
3.**Missing Initial Log File Creation**: `web/tasks.py` `generate_book_task` — The `initial_log` path is now `open(…, 'a')`-touched immediately after construction and before `utils.set_log_file()`, guaranteeing the file exists for UI polling even if the worker crashes on the very next line. *(Implemented v2.9)*
## Summary of Actionable Changes for Implementation Mode:
1. ✅ Modify `writer.py` to filter `chars_for_writer` based on characters named in `beats`. *(Implemented in v1.5.0)*
2. ✅ Modify `writer.py` `prev_content` logic to extract the *tail* of the chapter, not a blind slice. *(Implemented in v1.5.0 via `utils.truncate_to_tokens` tail logic)*
@@ -160,3 +170,4 @@ Three bugs combined to produce a blank page or silent failure when creating a ne
10.**(v2.6)** "Redo Book" form in `consistency_report.html` + `revise_book` route in `run.py` that creates a new run with the instruction applied as bible feedback. *(Implemented v2.6)*
11.**(v2.7)** Series Continuity Fix: `series_metadata` (is_series, series_title, book_number, total_books) injected as `SERIES_CONTEXT` into `story/planner.py` (`enrich`, `plan_structure`), `story/writer.py` (`write_chapter`), and `story/editor.py` (`evaluate_chapter_quality`) prompts with position-aware guidance per book number. *(Implemented v2.7)*
12.**(v2.8)** Infrastructure & UI Bug Fixes: API timeouts (180s generation, 30s list_models) in `ai/models.py` + `ai/setup.py`; Huey consumer moved to module level with reloader guard in `web/app.py`; Jinja2 `UndefinedError` fix for `tropes`/`formatting_rules` in `project_setup.html`; `project_setup_wizard` now renders form instead of silent redirect when models fail; `create_project_final` `enrich()` call fixed to use correct per-book blueprint structure. *(Implemented v2.8)*
13.**(v2.9)** Background Task Hang Fixes: OAuth headless guard in `ai/setup.py` (skips `run_local_server` in non-main threads, logs warning, falls back to ADC); SQLite `timeout=30, check_same_thread=False` on all connections in `web/tasks.py`; initial log file touched immediately in `generate_book_task` so UI polling never sees an empty/missing file. *(Implemented v2.9)*

View File

@@ -20,7 +20,7 @@ def db_log_callback(db_path, run_id, phase, msg):
"""Writes log entry directly to SQLite to avoid Flask Context issues in threads."""
for _ in range(5):
try:
with sqlite3.connect(db_path, timeout=30, check_same_thread=False) as conn:
conn.execute("INSERT INTO log_entry (run_id, timestamp, phase, message) VALUES (?, ?, ?, ?)",
(run_id, datetime.utcnow(), phase, str(msg)))
break
@@ -32,7 +32,7 @@ def db_progress_callback(db_path, run_id, percent):
"""Updates run progress in SQLite."""
for _ in range(5):
try:
with sqlite3.connect(db_path, timeout=30, check_same_thread=False) as conn:
conn.execute("UPDATE run SET progress = ? WHERE id = ?", (percent, run_id))
break
except sqlite3.OperationalError: time.sleep(0.1)
@@ -48,6 +48,15 @@ def generate_book_task(run_id, project_path, bible_path, allow_copy=True, feedba
# Log to project root initially until run folder is created by engine
initial_log = os.path.join(project_path, log_filename)
# Touch the file immediately so the UI has something to poll even if the
# worker crashes before the first utils.log() call.
try:
with open(initial_log, 'a', encoding='utf-8') as _f:
pass
except Exception:
pass
utils.set_log_file(initial_log)
# Hook up Database Logging
@@ -57,9 +66,12 @@ def generate_book_task(run_id, project_path, bible_path, allow_copy=True, feedba
# Set Status to Running
try:
with sqlite3.connect(db_path, timeout=30, check_same_thread=False) as conn:
conn.execute("UPDATE run SET status = 'running' WHERE id = ?", (run_id,))
except sqlite3.OperationalError as e:
utils.log("SYSTEM", f"⚠️ Database locked when setting run status (run {run_id}): {e}")
except Exception:
pass
utils.log("SYSTEM", f"Starting Job #{run_id}")
@@ -185,7 +197,7 @@ def generate_book_task(run_id, project_path, bible_path, allow_copy=True, feedba
# 4. Update Database with Final Status
try:
with sqlite3.connect(db_path, timeout=30, check_same_thread=False) as conn:
conn.execute("UPDATE run SET status = ?, cost = ?, end_time = ?, log_file = ?, progress = 100 WHERE id = ?",
(status, total_cost, datetime.utcnow(), final_log_path, run_id))
except Exception as e: