- core/utils.py: Added estimate_tokens(), truncate_to_tokens(), get_ai_cache(), set_ai_cache(), make_cache_key() utilities
- story/writer.py: Applied truncate_to_tokens() to prev_content (2000 tokens) and prev_sum (600 tokens) context injections
- story/editor.py: Applied truncate_to_tokens() to summary (1000 tokens), last_chapter_text (800 tokens), eval text (7500 tokens), propagation contexts (2500/3000 tokens)
- web/routes/persona.py: Added MD5-keyed in-memory cache for persona analyze endpoint; truncated sample_text to 750 tokens
- ai/models.py: Added pre-dispatch payload size estimation with 30k-token warning threshold

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
86 lines
2.9 KiB
Python
86 lines
2.9 KiB
Python
import os
|
|
import json
|
|
import time
|
|
import warnings
|
|
import google.generativeai as genai
|
|
from core import utils
|
|
|
|
# Ignore UserWarning instances attributed to the ``vertexai`` module.
# Registered ahead of the optional import below.
warnings.filterwarnings("ignore", category=UserWarning, module="vertexai")

# Vertex AI is an optional dependency: HAS_VERTEX records whether both the
# package and its preview image-generation model class could be imported.
try:
    import vertexai
    from vertexai.preview.vision_models import ImageGenerationModel as VertexImageModel
except ImportError:
    HAS_VERTEX = False
else:
    HAS_VERTEX = True
|
|
|
|
# The Google OAuth helpers are optional: HAS_OAUTH records whether all three
# imports succeeded, so code that needs them can check the flag first.
try:
    from google.auth.transport.requests import Request
    from google.oauth2.credentials import Credentials
    from google_auth_oauthlib.flow import InstalledAppFlow
except ImportError:
    HAS_OAUTH = False
else:
    HAS_OAUTH = True
|
|
|
|
# Module-level model handles, one per role. All start unset.
# NOTE(review): presumably populated by ai.setup.init_models() -- confirm.
model_logic = model_writer = model_artist = model_image = None

# Default text-model identifiers per role.
logic_model_name = "models/gemini-1.5-pro"
writer_model_name = artist_model_name = "models/gemini-1.5-flash"

# No image model is selected by default. Note that the source label is the
# *string* "None", not the None singleton.
image_model_name = None
image_model_source = "None"
|
|
|
|
|
|
class ResilientModel:
    """Retrying wrapper around a ``genai.GenerativeModel``.

    The wrapper keeps the model name, safety settings and a role label, and
    exposes ``generate_content`` with the same call signature as the wrapped
    model. Calls that fail with an error that looks transient are retried
    with exponential backoff.

    Attributes:
        name: Model identifier passed to ``genai.GenerativeModel``.
        safety_settings: Safety settings reused whenever the model is rebuilt.
        role: Label used in log messages to identify this model.
        model: The underlying ``genai.GenerativeModel`` instance.
    """

    # Estimated prompt size (in tokens) above which a warning is logged.
    _TOKEN_WARN_LIMIT = 30_000
    # Retries allowed after the first failed call (at most 4 calls in total).
    _MAX_RETRIES = 3
    # Seconds slept before the first retry; doubled on each further retry
    # (5s, 10s, 20s with the defaults).
    _BASE_DELAY = 5
    # Substrings of the lower-cased error text that mark an error as transient.
    # NOTE(review): plain substring matching can misfire on unrelated numbers
    # in a message (e.g. "1500"); kept as-is to preserve retry behaviour.
    _RETRYABLE_MARKERS = (
        "429",
        "quota",
        "500",
        "503",
        "504",
        "deadline",
        "internal error",
    )

    def __init__(self, name, safety_settings, role):
        """Build the underlying generative model.

        Args:
            name: Model identifier, e.g. ``"models/gemini-1.5-flash"``.
            safety_settings: Passed through to ``genai.GenerativeModel``.
            role: Label used in log messages.
        """
        self.name = name
        self.safety_settings = safety_settings
        self.role = role
        self.model = genai.GenerativeModel(name, safety_settings=safety_settings)

    def update(self, name):
        """Switch to model ``name``, reusing the stored safety settings."""
        self.name = name
        self.model = genai.GenerativeModel(name, safety_settings=self.safety_settings)

    @staticmethod
    def _extract_payload(args, kwargs):
        """Return the prompt: first positional argument, else ``contents=``."""
        if args:
            return args[0]
        return kwargs.get("contents")

    @staticmethod
    def _estimate_payload_tokens(payload):
        """Estimate the token count of a string or a sequence of parts.

        Only ``str`` parts are counted; any other part or payload type
        contributes 0.
        """
        if isinstance(payload, str):
            return utils.estimate_tokens(payload)
        if isinstance(payload, (list, tuple)):
            return sum(
                utils.estimate_tokens(part)
                for part in payload
                if isinstance(part, str)
            )
        return 0

    @classmethod
    def _is_retryable(cls, error):
        """Return True when the error text contains a transient-error marker."""
        text = str(error).lower()
        return any(marker in text for marker in cls._RETRYABLE_MARKERS)

    def _reoptimize_models(self):
        """Best-effort re-selection of models via ``ai.setup.init_models``.

        A failure here is logged and swallowed so that it neither aborts the
        retry loop nor replaces the API error that triggered the retry.
        """
        utils.log("SYSTEM", "Attempting to re-optimize models to find alternative...")
        try:
            # Imported lazily, as in the original code.
            # NOTE(review): presumably avoids a circular import with ai.setup
            # -- confirm.
            from ai import setup as _setup

            _setup.init_models(force=True)
        except Exception as setup_err:
            utils.log("SYSTEM", f"⚠️ Model re-optimization failed: {setup_err}")

    def generate_content(self, *args, **kwargs):
        """Call the wrapped model's ``generate_content`` with retries.

        Before dispatch, the prompt size is estimated and a warning is logged
        if it exceeds ``_TOKEN_WARN_LIMIT``. The prompt is taken from the
        first positional argument or, failing that, the ``contents`` keyword.

        Args:
            *args: Forwarded unchanged to the wrapped model.
            **kwargs: Forwarded unchanged to the wrapped model.

        Returns:
            Whatever the wrapped model's ``generate_content`` returns.

        Raises:
            Exception: The original error, re-raised unchanged, when it is
                not retryable or when all retries are used up.
        """
        est = self._estimate_payload_tokens(self._extract_payload(args, kwargs))
        if est > self._TOKEN_WARN_LIMIT:
            utils.log("SYSTEM", f"⚠️ Payload warning: ~{est:,} tokens for {self.role} ({self.name}). Consider reducing context.")

        for attempt in range(self._MAX_RETRIES + 1):
            try:
                # self.model is read on every attempt so that a model swapped
                # in through update() is used by the retry.
                return self.model.generate_content(*args, **kwargs)
            except Exception as err:
                if attempt >= self._MAX_RETRIES or not self._is_retryable(err):
                    # Bare raise keeps the original traceback intact.
                    raise

                delay = self._BASE_DELAY * (2 ** attempt)
                # The wording says "Quota error" for every transient error;
                # left unchanged so existing log output stays the same.
                utils.log("SYSTEM", f"⚠️ Quota error on {self.role} ({self.name}). Retrying in {delay}s...")
                time.sleep(delay)

                # Only the first failure triggers a model re-selection.
                if attempt == 0:
                    self._reoptimize_models()
|