Files
William Gill c07e962cae Image tools: migrate to OWUI 0.9.0 async model accessors
Open WebUI 0.9.0 made every model-class accessor (Users.get_user_by_id,
Chats.get_chat_by_id, Files.get_file_by_id, …) a coroutine. Both tools
were still calling them synchronously, so the calls returned coroutines
instead of model objects; the first downstream attribute access threw,
the bare `except Exception: return False` swallowed it, and uploads
silently fell through to the data-URI fallback. The data-URI markdown
rendered during streaming but didn't survive post-stream commit, which
looked like "image flashes in, then disappears."

Add await to the six call sites; promote `_read_file_dict` to async
since it now contains an await; restore `_push_image_to_chat` to the
canonical `files` event so the file-attachment chrome (thumbnail +
download) comes back.

This supersedes commit d034700, which mis-diagnosed the symptom as a
virtualization regression and switched to a `message`-event markdown
workaround. The workaround didn't help (same flash-and-vanish) because
the upload pre-check still failed for the same async-migration reason
and the data-URI fallback path still ran.

smart_image_gen.py 0.7.9 -> 0.7.10
smart_image_pipe.py 0.1.1 -> 0.1.2

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 06:16:02 -05:00

613 lines
25 KiB
Python

"""
title: Smart Image Studio (Pipe)
author: ai-stack
version: 0.1.2
description: Deterministic image-gen / edit / inpaint pipe — no LLM in the
loop for the routing decision. Registers as a model in the chat-model
dropdown ('Image Studio (Pipe)'). Reads the user's message + attached
image (if any), routes via regex, calls ComfyUI directly, returns the
image. Use when LLM-with-Tool tool-calling is leaking the call as text
(the abliterated Qwen 3.5 / Open WebUI parser interop bug).
required_open_webui_version: 0.5.0
"""
import asyncio
import base64
import inspect
import io
import json
import logging
import re
import time
import uuid
from typing import Awaitable, Callable, Literal, Optional

import aiohttp
from pydantic import BaseModel, Field

# Open WebUI runtime imports — same defensive guard as the sibling Tool.
try:
    from fastapi import UploadFile
    from open_webui.models.chats import Chats
    from open_webui.models.files import Files
    from open_webui.models.users import Users
    from open_webui.routers.files import upload_file_handler
    _OPENWEBUI_RUNTIME = True
except ImportError:
    _OPENWEBUI_RUNTIME = False
# ─────────────────────────────────────────────────────────────────────────────
# Per-style settings — kept in sync with smart_image_gen.py. If you change
# checkpoint filenames in comfyui-init-models.sh, update both files.
#
# Keys of each style entry and where this module uses them:
#   ckpt       checkpoint filename given to the CheckpointLoaderSimple node
#   sampler    KSampler "sampler_name"
#   scheduler  KSampler "scheduler"
#   cfg        KSampler "cfg"
#   steps      KSampler "steps"
#   clip_skip  negated and used as CLIPSetLastLayer "stop_at_clip_layer"
#   prefix     text prepended to the user's message to form the positive prompt
#   negative   text used as the negative prompt
# ─────────────────────────────────────────────────────────────────────────────
STYLES = {
    "photo": {
        "ckpt": "CyberRealisticXLPlay_V8.0_FP16.safetensors",
        "sampler": "dpmpp_2m_sde",
        "scheduler": "karras",
        "cfg": 4.0, "steps": 28, "clip_skip": 1,
        "prefix": "",
        "negative": (
            "cartoon, drawing, illustration, anime, manga, painting, sketch, "
            "render, 3d, cgi, plastic skin, oversaturated, "
            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
            "bad anatomy, deformed, extra fingers, missing fingers, "
            "watermark, signature, text, logo"
        ),
    },
    "juggernaut": {
        "ckpt": "Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors",
        "sampler": "dpmpp_2m_sde",
        "scheduler": "karras",
        "cfg": 4.5, "steps": 35, "clip_skip": 1,
        "prefix": "",
        "negative": (
            "cartoon, drawing, illustration, anime, painting, sketch, render, "
            "3d, cgi, plastic skin, washed out, "
            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
            "bad anatomy, deformed, extra fingers, missing fingers, "
            "watermark, signature, text, logo"
        ),
    },
    "pony": {
        "ckpt": "ponyDiffusionV6XL_v6StartWithThisOne.safetensors",
        "sampler": "euler_ancestral",
        "scheduler": "normal",
        "cfg": 7.5, "steps": 25, "clip_skip": 2,
        "prefix": "score_9, score_8_up, score_7_up, score_6_up, score_5_up, score_4_up, ",
        "negative": (
            "score_6, score_5, score_4, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, "
            "bad anatomy, bad hands, extra digit, fewer digits, "
            "deformed, ugly, censored, monochrome, "
            "watermark, signature, text, artist name"
        ),
    },
    "general": {
        "ckpt": "talmendoxlSDXL_v11Beta.safetensors",
        "sampler": "dpmpp_2m",
        "scheduler": "karras",
        "cfg": 8.0, "steps": 30, "clip_skip": 2,
        "prefix": "",
        "negative": (
            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
            "bad anatomy, deformed, ugly, watermark, signature, text"
        ),
    },
    "furry-nai": {
        "ckpt": "reedFURRYMixSDXL_v23nai.safetensors",
        "sampler": "euler_ancestral",
        "scheduler": "normal",
        "cfg": 5.0, "steps": 30, "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, high quality, detailed eyes, "
            "highres, absurdres, furry, "
        ),
        "negative": (
            "human, realistic, photorealistic, 3d, cgi, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, "
            "bad anatomy, extra digit, fewer digits, deformed, ugly, "
            "watermark, signature, text"
        ),
    },
    "furry-noob": {
        "ckpt": "indigoVoidFurryFusedXL_noobaiV32.safetensors",
        "sampler": "euler_ancestral",
        "scheduler": "normal",
        "cfg": 4.5, "steps": 20, "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, perfect quality, absurdres, newest, "
            "very aesthetic, vibrant colors, "
        ),
        "negative": (
            "human, realistic, photorealistic, 3d, cgi, shiny skin, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, "
            "bad anatomy, bad hands, mutated hands, "
            "watermark, signature, text"
        ),
    },
    "furry-il": {
        "ckpt": "novaFurryXL_ilV170.safetensors",
        "sampler": "euler_ancestral",
        "scheduler": "normal",
        "cfg": 4.0, "steps": 30, "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, amazing quality, very aesthetic, "
            "ultra-detailed, absurdres, newest, furry, anthro, "
        ),
        "negative": (
            "human, multiple tails, modern, recent, old, oldest, graphic, "
            "cartoon, painting, deformed, mutated, ugly, lowres, "
            "bad anatomy, bad hands, missing fingers, extra digits, "
            "worst quality, bad quality, sketch, jpeg artifacts, "
            "signature, watermark, text, simple background"
        ),
    },
}
# Style used when no routing rule matches, and the fallback when a selected
# style name is not a key of STYLES.
DEFAULT_STYLE = "furry-il"

# Ordered (compiled pattern, style name) pairs. _route_style returns the style
# of the FIRST pattern that matches, so earlier entries take precedence over
# later ones. All patterns are case-insensitive.
ROUTING_RULES = [
    (re.compile(r"\bscore_\d", re.I), "pony"),
    (re.compile(r"\bpony\b", re.I), "pony"),
    (re.compile(r"\b(noobai|noob)\b", re.I), "furry-noob"),
    (re.compile(r"\b(illustrious|ilxl)\b", re.I), "furry-il"),
    (re.compile(r"\b(furry|anthro|feral|kemono|fursona|species)\b", re.I), "furry-il"),
    (re.compile(r"\b(juggernaut)\b", re.I), "juggernaut"),
    (re.compile(r"\b(photo|photograph|realistic|portrait|selfie|cinematic)\b", re.I), "photo"),
    (re.compile(r"\b(anime|manga|2d|illustration)\b", re.I), "pony"),
]
# Phrases that imply local-only editing → triggers inpaint mode and
# pulls out a noun phrase as the mask text.
#
# Each pattern captures the noun phrase (2-30 word/space/apostrophe/hyphen
# characters, matched lazily) in group 1. _detect_mask_text tries them in
# order and uses the first one that matches:
#   1. an edit verb + determiner + <noun phrase>, ended by a connective or
#      by the end of the text
#   2. "make"/"turn" + determiner + <noun phrase> + a size or colour word
#   3. "only"/"just" + "the" / "change the" / "edit the" + <noun phrase>
INPAINT_PATTERNS = [
    re.compile(r"\b(?:change|recolor|edit|modify|replace|remove|delete|add)\s+(?:the|that|her|his|its)\s+([\w\s'-]{2,30}?)(?:\s+(?:to|into|with|so|that|and|,|\.)|$)", re.I),
    re.compile(r"\b(?:make|turn)\s+(?:the|that|her|his|its)\s+([\w\s'-]{2,30}?)\s+(?:bigger|smaller|larger|wider|taller|shorter|longer|brighter|darker|red|blue|green|yellow|orange|purple|pink|black|white|gold)", re.I),
    re.compile(r"\b(?:only|just)\s+(?:the|change the|edit the)\s+([\w\s'-]{2,30}?)(?:\s+|$)", re.I),
]
def _route_style(prompt: str) -> str:
    """Pick a style name for *prompt* by keyword.

    ROUTING_RULES is scanned in order and the style attached to the first
    pattern found in the prompt wins; DEFAULT_STYLE is returned when no
    pattern matches.
    """
    matching_styles = (
        style for pattern, style in ROUTING_RULES if pattern.search(prompt)
    )
    return next(matching_styles, DEFAULT_STYLE)
def _detect_mask_text(prompt: str) -> Optional[str]:
    """Extract the object of an edit-style instruction, for use as a mask prompt.

    INPAINT_PATTERNS is tried in order. The first pattern whose captured
    phrase is non-empty after trimming whitespace and trailing ``,``/``.``
    yields ``"the <phrase>"``. Returns None when nothing usable is found.
    """
    for regex in INPAINT_PATTERNS:
        found = regex.search(prompt)
        if found is None:
            continue
        target = found.group(1).strip().rstrip(",.").strip()
        if not target:
            continue
        return f"the {target}"
    return None
def _inherited_style(messages) -> Optional[str]:
    """Recover the style named in an earlier assistant reply, if any.

    Walks *messages* newest-first and looks at assistant entries whose
    content is a plain string. The first ``style: <name>`` / ``style=<name>``
    marker whose name is a key of STYLES is returned. Returns None when
    *messages* is empty or no such marker exists.
    """
    for entry in reversed(messages or []):
        # Only assistant dict messages can carry the marker.
        if not isinstance(entry, dict) or entry.get("role") != "assistant":
            continue
        text = entry.get("content")
        if not isinstance(text, str):
            continue
        hit = re.search(r"\bstyle[:=]\s*([\w\-]+)", text)
        if hit is not None and hit.group(1) in STYLES:
            return hit.group(1)
    return None
def _seed_value(seed: int) -> int:
    """Return *seed* when it is positive, otherwise a clock-derived seed.

    The fallback is the current time in milliseconds reduced modulo 2**31,
    so it always lies in ``[0, 2**31)``.
    """
    if seed > 0:
        return seed
    now_ms = int(time.time() * 1000)
    return now_ms % (2**31)
def _build_txt2img(positive: str, negative: str, settings: dict,
                   width: int, height: int, seed: int) -> dict:
    """Assemble the ComfyUI node graph for a text-to-image run.

    The sampler starts from an empty ``width`` x ``height`` latent at full
    denoise; the decoded image is written by the SaveImage node "9".
    *settings* is one entry of STYLES and *seed* goes through ``_seed_value``
    before reaching the sampler.
    """
    sampler_node = {
        "class_type": "KSampler",
        "inputs": {
            "seed": _seed_value(seed),
            "steps": settings["steps"],
            "cfg": settings["cfg"],
            "sampler_name": settings["sampler"],
            "scheduler": settings["scheduler"],
            "denoise": 1.0,
            "model": ["4", 0],
            "positive": ["6", 0],
            "negative": ["7", 0],
            "latent_image": ["5", 0],
        },
    }
    checkpoint_node = {
        "class_type": "CheckpointLoaderSimple",
        "inputs": {"ckpt_name": settings["ckpt"]},
    }
    latent_node = {
        "class_type": "EmptyLatentImage",
        "inputs": {"width": width, "height": height, "batch_size": 1},
    }
    # Both prompts are encoded with the clip-skipped CLIP from node "10".
    positive_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": positive, "clip": ["10", 0]},
    }
    negative_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": negative, "clip": ["10", 0]},
    }
    decode_node = {
        "class_type": "VAEDecode",
        "inputs": {"samples": ["3", 0], "vae": ["4", 2]},
    }
    save_node = {
        "class_type": "SaveImage",
        "inputs": {"filename_prefix": "smartpipe", "images": ["8", 0]},
    }
    clip_skip_node = {
        "class_type": "CLIPSetLastLayer",
        "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]},
    }
    return {
        "3": sampler_node,
        "4": checkpoint_node,
        "5": latent_node,
        "6": positive_node,
        "7": negative_node,
        "8": decode_node,
        "9": save_node,
        "10": clip_skip_node,
    }
def _build_img2img(positive: str, negative: str, settings: dict,
                   image_filename: str, denoise: float, seed: int) -> dict:
    """Assemble the ComfyUI node graph for a whole-image edit (img2img).

    The image named *image_filename* is loaded, VAE-encoded and used as the
    sampler's starting latent with the given *denoise*. The decoded result
    is written by the SaveImage node "9". *settings* is one entry of STYLES
    and *seed* goes through ``_seed_value`` before reaching the sampler.
    """
    sampler_node = {
        "class_type": "KSampler",
        "inputs": {
            "seed": _seed_value(seed),
            "steps": settings["steps"],
            "cfg": settings["cfg"],
            "sampler_name": settings["sampler"],
            "scheduler": settings["scheduler"],
            "denoise": denoise,
            "model": ["4", 0],
            "positive": ["6", 0],
            "negative": ["7", 0],
            "latent_image": ["11", 0],
        },
    }
    checkpoint_node = {
        "class_type": "CheckpointLoaderSimple",
        "inputs": {"ckpt_name": settings["ckpt"]},
    }
    # Both prompts are encoded with the clip-skipped CLIP from node "10".
    positive_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": positive, "clip": ["10", 0]},
    }
    negative_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": negative, "clip": ["10", 0]},
    }
    decode_node = {
        "class_type": "VAEDecode",
        "inputs": {"samples": ["3", 0], "vae": ["4", 2]},
    }
    save_node = {
        "class_type": "SaveImage",
        "inputs": {"filename_prefix": "smartpipe", "images": ["8", 0]},
    }
    clip_skip_node = {
        "class_type": "CLIPSetLastLayer",
        "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]},
    }
    encode_node = {
        "class_type": "VAEEncode",
        "inputs": {"pixels": ["12", 0], "vae": ["4", 2]},
    }
    load_node = {
        "class_type": "LoadImage",
        "inputs": {"image": image_filename},
    }
    return {
        "3": sampler_node,
        "4": checkpoint_node,
        "6": positive_node,
        "7": negative_node,
        "8": decode_node,
        "9": save_node,
        "10": clip_skip_node,
        "11": encode_node,
        "12": load_node,
    }
def _build_inpaint(positive: str, negative: str, settings: dict,
                   image_filename: str, mask_text: str,
                   denoise: float, seed: int) -> dict:
    """Assemble the ComfyUI node graph for a text-masked inpaint.

    The source image is VAE-encoded and restricted by a noise mask. The mask
    comes from the GroundingDINO + SAM segment node prompted with
    *mask_text*, grown by 12 via GrowMask. The decoded result is written by
    the SaveImage node "9". *settings* is one entry of STYLES and *seed*
    goes through ``_seed_value`` before reaching the sampler.
    """
    sampler_node = {
        "class_type": "KSampler",
        "inputs": {
            "seed": _seed_value(seed),
            "steps": settings["steps"],
            "cfg": settings["cfg"],
            "sampler_name": settings["sampler"],
            "scheduler": settings["scheduler"],
            "denoise": denoise,
            "model": ["4", 0],
            "positive": ["6", 0],
            "negative": ["7", 0],
            "latent_image": ["13", 0],
        },
    }
    checkpoint_node = {
        "class_type": "CheckpointLoaderSimple",
        "inputs": {"ckpt_name": settings["ckpt"]},
    }
    # Both prompts are encoded with the clip-skipped CLIP from node "10".
    positive_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": positive, "clip": ["10", 0]},
    }
    negative_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": negative, "clip": ["10", 0]},
    }
    decode_node = {
        "class_type": "VAEDecode",
        "inputs": {"samples": ["3", 0], "vae": ["4", 2]},
    }
    save_node = {
        "class_type": "SaveImage",
        "inputs": {"filename_prefix": "smartpipe", "images": ["8", 0]},
    }
    clip_skip_node = {
        "class_type": "CLIPSetLastLayer",
        "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]},
    }
    encode_node = {
        "class_type": "VAEEncode",
        "inputs": {"pixels": ["12", 0], "vae": ["4", 2]},
    }
    load_node = {
        "class_type": "LoadImage",
        "inputs": {"image": image_filename},
    }
    masked_latent_node = {
        "class_type": "SetLatentNoiseMask",
        "inputs": {"samples": ["11", 0], "mask": ["17", 0]},
    }
    sam_loader_node = {
        "class_type": "SAMModelLoader (segment anything)",
        "inputs": {"model_name": "sam_hq_vit_h (2.57GB)"},
    }
    dino_loader_node = {
        "class_type": "GroundingDinoModelLoader (segment anything)",
        "inputs": {"model_name": "GroundingDINO_SwinT_OGC (694MB)"},
    }
    segment_node = {
        "class_type": "GroundingDinoSAMSegment (segment anything)",
        "inputs": {
            "sam_model": ["14", 0],
            "grounding_dino_model": ["15", 0],
            "image": ["12", 0],
            "prompt": mask_text,
            "threshold": 0.3,
        },
    }
    # The grown mask is taken from output slot 1 of the segment node.
    grow_mask_node = {
        "class_type": "GrowMask",
        "inputs": {"mask": ["16", 1], "expand": 12, "tapered_corners": True},
    }
    return {
        "3": sampler_node,
        "4": checkpoint_node,
        "6": positive_node,
        "7": negative_node,
        "8": decode_node,
        "9": save_node,
        "10": clip_skip_node,
        "11": encode_node,
        "12": load_node,
        "13": masked_latent_node,
        "14": sam_loader_node,
        "15": dino_loader_node,
        "16": segment_node,
        "17": grow_mask_node,
    }
# Captures the file id from URLs of the form "/files/<id>" or
# "/api/v1/files/<id>", optionally followed by "/content". The id must be at
# least 8 characters drawn from hex digits and "-".
_FILE_URL_ID_RE = re.compile(r"/(?:api/v1/)?files/([0-9a-fA-F-]{8,})(?:/content)?")
def _file_dict_is_image(f: dict) -> bool:
    """Tell whether a file dict describes an image.

    True when the dict's ``type`` contains "image" (case-insensitive) or
    when its ``name`` / ``filename`` ends in .png, .jpg, .jpeg or .webp.
    """
    declared_type = (f.get("type") or "").lower()
    filename = (f.get("name") or f.get("filename") or "").lower()
    if "image" in declared_type:
        return True
    return filename.endswith((".png", ".jpg", ".jpeg", ".webp"))
async def _read_file_dict(f: dict) -> Optional[bytes]:
    """Return the raw bytes behind a file dict, or None if they can't be read.

    Lookup order:
      1. A filesystem path stored directly on the dict under ``path``,
         ``filepath`` or ``file_path``.
      2. When the Open WebUI runtime is importable: the file record for the
         dict's ``id`` and/or the id embedded in its ``url``, using the
         record's ``path`` attribute or, failing that, ``meta["path"]``.

    This is best-effort: unreadable paths and failing lookups are skipped
    (lookup failures are logged) and never raised to the caller.
    """

    def _read_bytes(path) -> Optional[bytes]:
        # Executed in a worker thread so the read never blocks the event loop.
        try:
            with open(path, "rb") as fh:
                return fh.read()
        except (OSError, TypeError, ValueError):
            # Missing/unreadable file, or a value that is not a usable path.
            return None

    loop = asyncio.get_running_loop()

    # 1. Path carried on the dict itself.
    for path_key in ("path", "filepath", "file_path"):
        path = f.get(path_key)
        if path:
            data = await loop.run_in_executor(None, _read_bytes, path)
            if data is not None:
                return data

    # 2. Ids to resolve through the Files model, without duplicates (the
    #    dict's id and the id inside its URL are often the same).
    candidate_ids = []
    if f.get("id"):
        candidate_ids.append(f["id"])
    url = f.get("url")
    if isinstance(url, str):
        m = _FILE_URL_ID_RE.search(url)
        if m and m.group(1) not in candidate_ids:
            candidate_ids.append(m.group(1))
    if not (candidate_ids and _OPENWEBUI_RUNTIME):
        return None

    log = logging.getLogger(__name__)
    for fid in candidate_ids:
        try:
            file_model = Files.get_file_by_id(fid)
            # The accessor is a coroutine on Open WebUI 0.9.0+ and a plain
            # call on earlier releases; accept both.
            if inspect.isawaitable(file_model):
                file_model = await file_model
            if file_model is None:
                continue
            path = getattr(file_model, "path", None)
            if not path:
                meta = getattr(file_model, "meta", None) or {}
                path = (
                    meta.get("path")
                    if isinstance(meta, dict)
                    else getattr(meta, "path", None)
                )
        except Exception:
            # Stay best-effort, but leave a trace: a silently swallowed
            # failure here is indistinguishable from "no image attached".
            log.warning("File lookup failed for id %r", fid, exc_info=True)
            continue
        if path:
            data = await loop.run_in_executor(None, _read_bytes, path)
            if data is not None:
                return data
    return None
async def _extract_attached_image(files, messages, metadata, session) -> Optional[bytes]:
    """Find the most recent image attached to the conversation.

    Sources are tried in this order and the first hit wins:
      1. Inline ``data:image`` URIs in ``image_url`` content blocks,
         newest message first.
      2. Image entries in each message's ``files`` list, newest first.
      3. Image entries in *files*.
      4. When the Open WebUI runtime is importable and *metadata* carries a
         ``chat_id``: image entries in the stored chat's messages.

    *session* is accepted for interface compatibility; this function does
    not use it. Returns the image bytes, or None when nothing was found.
    Malformed message blocks are skipped instead of raising.
    """

    async def _first_readable_image(candidates) -> Optional[bytes]:
        # Bytes of the first image-like file dict that can actually be read.
        for f in candidates or []:
            if isinstance(f, dict) and _file_dict_is_image(f):
                data = await _read_file_dict(f)
                if data is not None:
                    return data
        return None

    history = [m for m in (messages or []) if isinstance(m, dict)]

    # 1. Inline data URIs
    for msg in reversed(history):
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        for block in content:
            if not isinstance(block, dict) or block.get("type") != "image_url":
                continue
            image_url = block.get("image_url")
            url = image_url.get("url") if isinstance(image_url, dict) else None
            if not (isinstance(url, str) and url.startswith("data:image")):
                continue
            try:
                return base64.b64decode(url.split(",", 1)[1])
            except (IndexError, ValueError):
                # No payload after the header, or not valid base64.
                continue

    # 2. messages[].files
    for msg in reversed(history):
        data = await _first_readable_image(msg.get("files"))
        if data is not None:
            return data

    # 3. __files__
    data = await _first_readable_image(files)
    if data is not None:
        return data

    # 4. DB lookup (assistant-emitted files often only land here)
    chat_id = metadata.get("chat_id") if isinstance(metadata, dict) else None
    if not (chat_id and _OPENWEBUI_RUNTIME):
        return None
    try:
        chat = Chats.get_chat_by_id(chat_id)
        # The accessor is a coroutine on Open WebUI 0.9.0+ and a plain call
        # on earlier releases; accept both.
        if inspect.isawaitable(chat):
            chat = await chat
        chat_data = getattr(chat, "chat", None) if chat else None
        chat_messages = (
            chat_data.get("messages") or [] if isinstance(chat_data, dict) else []
        )
        for msg in reversed(chat_messages):
            if not isinstance(msg, dict):
                continue
            data = await _first_readable_image(msg.get("files"))
            if data is not None:
                return data
    except Exception:
        # Best-effort: report "no image" to the caller, but leave a trace.
        logging.getLogger(__name__).warning(
            "Chat lookup failed for chat_id %r", chat_id, exc_info=True
        )
    return None
async def _upload_to_comfyui(session, base, raw) -> Optional[str]:
    """Upload *raw* image bytes to ComfyUI's ``/upload/image`` endpoint.

    The bytes are posted as a PNG under a random ``smartpipe_<hex>.png``
    name with ``overwrite`` set to "true". Returns the ``name`` field of the
    JSON response (or the generated name when that field is absent), or
    None when the response status is not 200.
    """
    upload_name = f"smartpipe_{uuid.uuid4().hex[:12]}.png"
    payload = aiohttp.FormData()
    payload.add_field("image", raw, filename=upload_name, content_type="image/png")
    payload.add_field("overwrite", "true")
    endpoint = f"{base}/upload/image"
    async with session.post(endpoint, data=payload) as response:
        if response.status == 200:
            reply = await response.json()
            return reply.get("name", upload_name)
        return None
async def _push_image_to_chat(raw, prefix, request, user_dict, metadata, event_emitter) -> bool:
    """Store *raw* as an uploaded file and attach it to the chat.

    The bytes are wrapped in an ``UploadFile`` named ``<prefix>_<hex>.png``,
    passed to ``upload_file_handler`` with the chat and message ids from
    *metadata*, and then announced through *event_emitter* as a ``files``
    event pointing at the file's content URL.

    Returns True when the event was emitted. Returns False when the Open
    WebUI runtime, *request*, *user_dict* or *event_emitter* is missing,
    when the user cannot be resolved, or when any step raises. Failures are
    logged with their traceback, not discarded, so a broken upload path is
    visible in the server log.
    """
    if not (request and user_dict and event_emitter and _OPENWEBUI_RUNTIME):
        return False
    try:
        user = Users.get_user_by_id(user_dict.get("id"))
        # The accessor is a coroutine on Open WebUI 0.9.0+ and a plain call
        # on earlier releases; accept both, as is done for the upload
        # handler below.
        if inspect.isawaitable(user):
            user = await user
        if not user:
            return False
        ids = metadata or {}
        upload = UploadFile(
            file=io.BytesIO(raw),
            filename=f"{prefix}_{uuid.uuid4().hex[:8]}.png",
            headers={"content-type": "image/png"},
        )
        result = upload_file_handler(
            request=request, file=upload,
            metadata={"chat_id": ids.get("chat_id"),
                      "message_id": ids.get("message_id")},
            process=False, user=user,
        )
        file_item = await result if inspect.isawaitable(result) else result
        url = request.app.url_path_for("get_file_content_by_id", id=file_item.id)
        await event_emitter({
            "type": "files",
            "data": {"files": [{"type": "image", "url": url}]},
        })
        return True
    except Exception:
        # Boundary with the host application: never raise into the pipe,
        # but record why the attachment failed.
        logging.getLogger(__name__).exception("Attaching generated image to chat failed")
        return False
async def _submit_and_fetch(session, base, workflow, timeout_seconds, emit, settings):
    """Queue *workflow* on ComfyUI, wait for it, and download the first image.

    Args:
        session: aiohttp client session used for every request.
        base: ComfyUI base URL without a trailing slash.
        workflow: node graph posted to ``/prompt``.
        timeout_seconds: how long to keep polling ``/history/<prompt_id>``.
        emit: async callable taking a status message; called once after the
            prompt is accepted.
        settings: style entry whose sampler/scheduler/cfg/steps are shown in
            that status message.

    Returns:
        ``(image_bytes, None)`` on success, or ``(None, error_message)``
        when the prompt is rejected, no prompt id comes back, the job ends
        without an image, polling times out, or the download fails.
    """
    SAVE_NODE_ID = "9"
    POLL_INTERVAL_SECONDS = 1.5

    client_id = str(uuid.uuid4())
    async with session.post(
        f"{base}/prompt", json={"prompt": workflow, "client_id": client_id}
    ) as resp:
        if resp.status != 200:
            return None, f"ComfyUI rejected the prompt: {resp.status} {await resp.text()}"
        prompt_id = (await resp.json()).get("prompt_id")
    if not prompt_id:
        return None, "ComfyUI didn't return a prompt_id."

    await emit(
        f"Sampling — {settings['sampler']}/{settings['scheduler']}, "
        f"CFG {settings['cfg']}, {settings['steps']} steps"
    )

    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout_seconds
    output_images: list = []
    while time.monotonic() < deadline:
        await asyncio.sleep(POLL_INTERVAL_SECONDS)
        async with session.get(f"{base}/history/{prompt_id}") as resp:
            if resp.status != 200:
                continue
            history = await resp.json()
        entry = history.get(prompt_id) if isinstance(history, dict) else None
        if not isinstance(entry, dict):
            # Not in the history yet — keep polling.
            continue

        outputs = entry.get("outputs") or {}
        # Prefer the SaveImage node; fall back to any node that produced images.
        output_images.extend((outputs.get(SAVE_NODE_ID) or {}).get("images") or [])
        if not output_images:
            for node_out in outputs.values():
                if isinstance(node_out, dict):
                    output_images.extend(node_out.get("images") or [])
        if output_images:
            break

        # The job has a history entry but no images. If the entry says the
        # run failed or completed, stop now and don't wait out the timeout.
        # NOTE(review): relies on ComfyUI's history entry carrying a
        # ``status`` dict with ``status_str`` / ``completed`` — confirm
        # against the deployed ComfyUI version. Entries without it keep the
        # previous behaviour (poll until the deadline).
        status = entry.get("status")
        if isinstance(status, dict):
            if status.get("status_str") == "error":
                return None, f"ComfyUI reported an error for prompt {prompt_id}."
            if status.get("completed"):
                return None, f"ComfyUI finished prompt {prompt_id} without producing an image."

    if not output_images:
        return None, f"Timed out after {timeout_seconds}s waiting for image."

    img = output_images[0]
    params = {
        "filename": img["filename"],
        "subfolder": img.get("subfolder", ""),
        "type": img.get("type", "output"),
    }
    async with session.get(f"{base}/view", params=params) as resp:
        if resp.status != 200:
            return None, f"Failed to fetch image: {resp.status}"
        return await resp.read(), None
def _extract_user_text(body: dict) -> str:
    """Return the text of the most recent user message in *body*.

    Messages are scanned newest-first. A string ``content`` is returned
    stripped; a list ``content`` yields its ``text`` blocks joined with
    single spaces and stripped. User messages with any other content type
    are skipped. Returns "" when ``messages`` is missing, None or empty, or
    when no user message carries text content.
    """
    # `or []` also covers an explicit `"messages": None`.
    for msg in reversed(body.get("messages") or []):
        if not isinstance(msg, dict) or msg.get("role") != "user":
            continue
        content = msg.get("content")
        if isinstance(content, str):
            return content.strip()
        if isinstance(content, list):
            parts = []
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    text = block.get("text", "")
                    # A null/non-string text field counts as empty text.
                    parts.append(text if isinstance(text, str) else "")
            return " ".join(parts).strip()
    return ""
class Pipe:
    """Pipe that turns the latest user message into a ComfyUI image job.

    With no attached image the message is rendered as txt2img. With an
    attached image the job is an inpaint when the message names a region to
    edit (see ``_detect_mask_text``) and an img2img otherwise.
    """

    class Valves(BaseModel):
        """Configurable settings for the pipe."""

        COMFYUI_BASE_URL: str = Field(
            default="http://comfyui:8188",
            description="ComfyUI server URL reachable from the open-webui container.",
        )
        # Upper bound, in seconds, on polling ComfyUI for the finished image.
        TIMEOUT_SECONDS: int = Field(default=600)
        # Latent size used for txt2img.
        DEFAULT_WIDTH: int = Field(default=1024)
        DEFAULT_HEIGHT: int = Field(default=1024)
        # Sampler denoise for whole-image edits and for masked edits.
        DEFAULT_DENOISE_IMG2IMG: float = Field(default=0.7)
        DEFAULT_DENOISE_INPAINT: float = Field(default=1.0)
        FORCE_STYLE: str = Field(
            default="",
            description="Override style routing. Empty = auto-route. Set to "
                        "one of: photo, juggernaut, pony, general, "
                        "furry-nai, furry-noob, furry-il.",
        )

    def __init__(self):
        self.valves = self.Valves()
        self.id = "image-studio-pipe"
        self.name = "Image Studio (Pipe)"

    async def pipe(
        self,
        body: dict,
        __user__: Optional[dict] = None,
        __request__=None,
        __metadata__: Optional[dict] = None,
        __event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None,
    ) -> str:
        """Generate or edit an image for the latest user message.

        Returns a one-line text reply. On success it contains
        ``style: <name>`` so ``_inherited_style`` can pick the style up on
        the next turn; on failure it is a plain error message. The image
        itself is delivered through ``_push_image_to_chat``.
        """
        user_text = _extract_user_text(body)
        if not user_text:
            return "Type a message describing the image you want."

        async def emit(msg: str, done: bool = False):
            if __event_emitter__:
                await __event_emitter__({
                    "type": "status",
                    "data": {"description": msg, "done": done},
                })

        async def fail(msg: str) -> str:
            # Mark the status as done so the last progress message is not
            # left open when the pipe stops early.
            await emit(msg, done=True)
            return msg

        # Style: explicit valve override > inherited from prior assistant
        # message > keyword detection on user text > default.
        forced = self.valves.FORCE_STYLE.strip().lower()
        if forced not in STYLES:
            # Empty or unrecognised override: fall back to auto-routing and
            # don't silently pin every request to the default style.
            forced = ""
        chosen = (
            forced
            or _inherited_style(body.get("messages"))
            or _route_style(user_text)
        )
        if chosen not in STYLES:
            chosen = DEFAULT_STYLE
        settings = STYLES[chosen]
        base = self.valves.COMFYUI_BASE_URL.rstrip("/")
        positive = f"{settings['prefix']}{user_text}"
        negative = settings["negative"]

        try:
            async with aiohttp.ClientSession() as session:
                await emit("Looking for attached image…")
                source_bytes = await _extract_attached_image(
                    None, body.get("messages"), __metadata__, session,
                )
                if source_bytes is None:
                    # No image → txt2img
                    await emit(f"Generating ({chosen})")
                    workflow = _build_txt2img(
                        positive, negative, settings,
                        self.valves.DEFAULT_WIDTH, self.valves.DEFAULT_HEIGHT, 0,
                    )
                    tag = "gen"
                else:
                    # Image present → upload, then inpaint or img2img
                    uploaded = await _upload_to_comfyui(session, base, source_bytes)
                    if not uploaded:
                        return await fail("Failed to upload source image to ComfyUI.")
                    mask_text = _detect_mask_text(user_text)
                    if mask_text:
                        await emit(
                            f"Inpainting ({chosen}, mask='{mask_text}', "
                            f"denoise={self.valves.DEFAULT_DENOISE_INPAINT})"
                        )
                        workflow = _build_inpaint(
                            positive, negative, settings, uploaded, mask_text,
                            self.valves.DEFAULT_DENOISE_INPAINT, 0,
                        )
                        tag = f"edit (inpaint: {mask_text})"
                    else:
                        await emit(
                            f"Editing ({chosen}, "
                            f"denoise={self.valves.DEFAULT_DENOISE_IMG2IMG})"
                        )
                        workflow = _build_img2img(
                            positive, negative, settings, uploaded,
                            self.valves.DEFAULT_DENOISE_IMG2IMG, 0,
                        )
                        tag = "edit (img2img)"
                raw, err = await _submit_and_fetch(
                    session, base, workflow, self.valves.TIMEOUT_SECONDS, emit, settings,
                )
        except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
            # Connection-level failure talking to ComfyUI: answer with a
            # readable message in place of a raw exception.
            return await fail(f"Could not reach ComfyUI at {base}: {exc}")
        if err:
            return await fail(err)

        pushed = await _push_image_to_chat(
            raw, "smartpipe", __request__, __user__, __metadata__, __event_emitter__,
        )
        if not pushed:
            # Don't claim success when the image never reached the chat.
            # The style marker is kept so the next turn still inherits it.
            await emit("Image generated, but attaching it to the chat failed", done=True)
            return (
                "The image was generated but could not be attached to the chat "
                f"— style: {chosen}, {tag}."
            )
        await emit(f"Done — {chosen}", done=True)
        # Single-line plain-English follow-up. Emit the style as
        # "style: <name>" so the inheritance helper can find it next turn.
        return f"Done — style: {chosen}, {tag}."