Files
William Gill c07e962cae Image tools: migrate to OWUI 0.9.0 async model accessors
Open WebUI 0.9.0 made every model-class accessor (Users.get_user_by_id,
Chats.get_chat_by_id, Files.get_file_by_id, …) a coroutine. Both tools
were still calling them synchronously, so the calls returned coroutines
instead of model objects; the first downstream attribute access threw,
the bare `except Exception: return False` swallowed it, and uploads
silently fell through to the data-URI fallback. The data-URI markdown
rendered during streaming but didn't survive post-stream commit, which
looked like "image flashes in, then disappears."

Add await to the six call sites; promote `_read_file_dict` to async
since it now contains an await; restore `_push_image_to_chat` to the
canonical `files` event so the file-attachment chrome (thumbnail +
download) comes back.

This supersedes commit d034700, which mis-diagnosed the symptom as a
virtualization regression and switched to a `message`-event markdown
workaround. The workaround didn't help (same flash-and-vanish) because
the upload pre-check still failed for the same async-migration reason
and the data-URI fallback path still ran.

smart_image_gen.py 0.7.9 -> 0.7.10
smart_image_pipe.py 0.1.1 -> 0.1.2

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 06:16:02 -05:00

1025 lines
44 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
title: Smart Image Generator & Editor (ComfyUI)
author: ai-stack
version: 0.7.10
description: Generate or edit images via ComfyUI with automatic SDXL
checkpoint routing. Two methods — generate_image (txt2img) and
edit_image (img2img on the user's most recently attached image). The
LLM picks (or auto-detects) the right model — photoreal, Pony
score-tag, NoobAI/Illustrious furry, etc. — and each style ships
with the creator-recommended sampler, scheduler, CFG, steps, CLIP
skip, prompt-prefix dialect, and negatives. The image is uploaded
to Open WebUI's file store and surfaced via a `files` event (the
canonical pattern used by Open WebUI's own image-gen path); the
function return is a short confirmation so the LLM doesn't try to
describe or re-emit the image.
required_open_webui_version: 0.5.0
"""
import asyncio
import base64
import inspect
import io
import json
import re
import time
import uuid
from typing import Awaitable, Callable, Literal, Optional
import aiohttp
from pydantic import BaseModel, Field
# Open WebUI's runtime — only available when the tool is loaded inside the
# Open WebUI process. Guarded so the module still imports for standalone
# linting/testing; if the imports fail at runtime, _push_image_to_chat
# falls back to emitting a markdown data-URI message.
try:
    # All five names are imported in one try block: if any single import
    # fails, the flag below is set to False and the upload / DB-lookup
    # paths gated on it are skipped.
    from fastapi import UploadFile
    from open_webui.models.chats import Chats
    from open_webui.models.files import Files
    from open_webui.models.users import Users
    from open_webui.routers.files import upload_file_handler
    _OPENWEBUI_RUNTIME = True
except ImportError:
    # Some or all of the names above are undefined on this branch, so every
    # use of them in this module must be gated on _OPENWEBUI_RUNTIME.
    _OPENWEBUI_RUNTIME = False
StyleName = Literal[
"photo", "juggernaut", "pony", "general",
"furry-nai", "furry-noob", "furry-il",
]
# ─────────────────────────────────────────────────────────────────────────────
# Per-style settings — sampler/scheduler/cfg/steps/clip_skip/prefix/negatives
# come from each model's creator page on Civitai. Three prefix dialects in
# play: photoreal (no prefix, natural language), Pony score chain (REQUIRED
# for any Pony-derived checkpoint), and Booru quality tags (NoobAI /
# Illustrious lineage). Never cross-contaminate.
# ─────────────────────────────────────────────────────────────────────────────
STYLES = {
"photo": {
"ckpt": "CyberRealisticXLPlay_V8.0_FP16.safetensors",
"sampler": "dpmpp_2m_sde",
"scheduler": "karras",
"cfg": 4.0,
"steps": 28,
"clip_skip": 1,
"prefix": "", # natural language only — no quality tags
"negative": (
"cartoon, drawing, illustration, anime, manga, painting, sketch, "
"render, 3d, cgi, watercolor, plastic skin, doll-like, oversaturated, "
"lowres, blurry, jpeg artifacts, noisy, grainy, low quality, worst quality, "
"bad anatomy, deformed, mutated, extra limbs, extra fingers, missing fingers, "
"fused fingers, malformed hands, asymmetric face, "
"watermark, signature, text, logo, label, username"
),
},
"juggernaut": {
"ckpt": "Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors",
"sampler": "dpmpp_2m_sde",
"scheduler": "karras",
"cfg": 4.5,
"steps": 35,
"clip_skip": 1,
"prefix": "", # natural language only
"negative": (
"cartoon, drawing, illustration, anime, manga, painting, sketch, "
"render, 3d, cgi, plastic skin, washed out, oversaturated, "
"lowres, blurry, jpeg artifacts, low quality, worst quality, "
"bad anatomy, deformed, mutated, extra limbs, extra fingers, missing fingers, "
"fused fingers, malformed hands, "
"watermark, signature, text, logo, username"
),
},
"pony": {
"ckpt": "ponyDiffusionV6XL_v6StartWithThisOne.safetensors",
"sampler": "euler_ancestral",
"scheduler": "normal",
"cfg": 7.5,
"steps": 25,
"clip_skip": 2,
# REQUIRED — the full chain. Just `score_9` alone is much weaker.
"prefix": "score_9, score_8_up, score_7_up, score_6_up, score_5_up, score_4_up, ",
# Pony's creator notes negatives are usually unnecessary; conservative
# baseline only. Source-toggle tags (source_pony/furry/anime/cartoon)
# are intentionally omitted — they exclude entire content domains.
"negative": (
"score_6, score_5, score_4, "
"worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
"bad anatomy, bad proportions, bad hands, extra digit, fewer digits, "
"fused fingers, malformed limbs, deformed, ugly, "
"censored, monochrome, "
"watermark, signature, text, logo, artist name, patreon username, twitter username"
),
},
"general": {
"ckpt": "talmendoxlSDXL_v11Beta.safetensors",
"sampler": "dpmpp_2m",
"scheduler": "karras",
"cfg": 8.0, # Talmendo wants notably higher CFG than the others
"steps": 30,
"clip_skip": 2,
"prefix": "", # creator says don't push "masterpiece" — fights the amateur aesthetic
"negative": (
"lowres, blurry, jpeg artifacts, noisy, grainy, low quality, worst quality, "
"bad anatomy, deformed, mutated, extra limbs, missing fingers, fused fingers, "
"malformed hands, ugly, "
"watermark, signature, text, logo"
),
},
"furry-nai": {
"ckpt": "reedFURRYMixSDXL_v23nai.safetensors",
"sampler": "euler_ancestral",
"scheduler": "normal",
"cfg": 5.0,
"steps": 30,
"clip_skip": 2,
"prefix": (
"masterpiece, best quality, high quality, good quality, "
"detailed eyes, highres, absurdres, incredibly absurdres, "
),
"negative": (
"worst quality, bad_quality, normal quality, lowres, anatomical nonsense, "
"bad anatomy, anatomical nonsense, interlocked fingers, extra fingers, "
"watermark, simple background, transparent, bad_feet, bad_hands, "
"logo, text, bad_anatomy, signature, face backlighting, "
"(worst quality, bad quality:1.2), jpeg artifacts, censored, "
"extra digit, ugly, deformed anatomy, bad proportions, "
),
},
"furry-noob": {
"ckpt": "indigoVoidFurryFusedXL_noobaiV32.safetensors",
"sampler": "euler_ancestral", # creator: other samplers won't work
"scheduler": "normal",
"cfg": 4.5,
"steps": 20,
"clip_skip": 2,
"prefix": (
"masterpiece, best quality, perfect quality, absurdres, newest, "
"very aesthetic, vibrant colors, "
),
"negative": (
"human, realistic, photorealistic, 3d, cgi, "
"shiny skin, shiny clothing, "
"worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
"bad anatomy, bad hands, mutated hands, bad proportions, "
"extra digit, fewer digits, fused fingers, malformed limbs, deformed, ugly, "
"watermark, signature, text, logo, username, artist signature"
),
},
"furry-il": {
"ckpt": "novaFurryXL_ilV170.safetensors",
"sampler": "euler_ancestral",
"scheduler": "normal",
"cfg": 4.0,
"steps": 30,
"clip_skip": 2,
# Illustrious wants `newest` in positive and `old`/`oldest` in negative
# — these are year-bucket tags from the training set. `furry` and
# `anthro` are universally helpful here.
"prefix": (
"masterpiece, best quality, amazing quality, very aesthetic, "
"high resolution, ultra-detailed, absurdres, newest, furry, anthro, "
),
"negative": (
"human, multiple tails, modern, recent, old, oldest, "
"graphic, cartoon, painting, crayon, graphite, abstract, glitch, "
"deformed, mutated, ugly, disfigured, long body, conjoined, "
"lowres, bad anatomy, bad hands, missing fingers, extra digits, fewer digits, "
"cropped, very displeasing, worst quality, bad quality, sketch, "
"jpeg artifacts, signature, watermark, username, text, simple background, "
"bad ai-generated"
),
},
}
DEFAULT_STYLE = "general"
# First-match-wins keyword router used when the caller didn't pick a style.
# Order matters — narrower patterns above broader ones.
# Each entry is (compiled case-insensitive pattern, style key). Patterns are
# applied with .search(), so they match anywhere in the prompt.
ROUTING_RULES = [
    # Pony score chain is the single strongest signal — Pony only
    (re.compile(r"\bscore_\d", re.I), "pony"),
    (re.compile(r"\bpony\b", re.I), "pony"),
    # NoobAI / Illustrious explicit mentions
    (re.compile(r"\b(noobai|noob)\b", re.I), "furry-noob"),
    (re.compile(r"\b(illustrious|ilxl)\b", re.I), "furry-il"),
    # Generic furry — defaults to NovaFurry (Illustrious lineage, current sweet spot)
    (re.compile(r"\b(furry|anthro|feral|kemono|fursona|species)\b", re.I), "furry-il"),
    # Photo / photoreal
    (re.compile(r"\b(juggernaut)\b", re.I), "juggernaut"),
    (re.compile(r"\b(photo|photograph|realistic|portrait|selfie|cinematic)\b", re.I), "photo"),
    # Generic anime / illustration → Pony covers anime well
    (re.compile(r"\b(anime|manga|2d|illustration)\b", re.I), "pony"),
]
def _route_style(prompt: str) -> str:
    """Return the style key of the first ROUTING_RULES pattern found in
    `prompt`, or DEFAULT_STYLE when no pattern matches."""
    hits = (style_key for rx, style_key in ROUTING_RULES if rx.search(prompt))
    return next(hits, DEFAULT_STYLE)
def _inherited_style(messages: Optional[list]) -> Optional[str]:
    """
    Return the `style` argument of the most recent generate_image /
    edit_image tool call in the conversation that carries a valid style.

    Used so edit_image can auto-inherit the style of the image being
    edited when the LLM didn't pass one explicitly — without this, an
    edit on a furry image with a neutral edit prompt ("make the eyes
    glow") falls through to the keyword router and picks a wrong style.

    Messages are scanned newest first; within one message the tool calls
    are scanned in their listed order. Calls whose arguments are missing,
    unparseable, not a JSON object, or name a style that is not a key of
    STYLES are skipped and the scan continues with older calls.

    :param messages: Chat messages (dicts, possibly with a `tool_calls`
        list). None or empty yields None.
    :return: A key of STYLES, or None when no prior call supplies one.
    """
    for msg in reversed(messages or []):
        if not isinstance(msg, dict):
            continue
        calls = msg.get("tool_calls")
        if not isinstance(calls, (list, tuple)):
            continue
        for tc in calls:
            if not isinstance(tc, dict):
                continue
            fn = tc.get("function")
            # A malformed entry (string / list instead of an object) must
            # not abort the whole scan.
            if not isinstance(fn, dict):
                continue
            if fn.get("name") not in ("generate_image", "edit_image"):
                continue
            args = fn.get("arguments")
            if isinstance(args, str):
                try:
                    args = json.loads(args)
                except (TypeError, ValueError):
                    continue
            # json.loads can legitimately return a list, str, number or
            # None; only a JSON object can hold a `style` key.
            if not isinstance(args, dict):
                continue
            style = args.get("style")
            if isinstance(style, str) and style in STYLES:
                return style
    return None
def _seed_value(seed: int) -> int:
    """Return `seed` unchanged when positive; otherwise derive one from the
    current wall-clock time in milliseconds, reduced modulo 2**31."""
    if seed > 0:
        return seed
    now_ms = int(time.time() * 1000)
    return now_ms % (2**31)
def _job_prefix(kind: str) -> str:
    """Build a unique SaveImage filename_prefix of the form
    `<kind>_<10 hex chars>` so outputs of concurrent jobs never share an
    auto-numbered counter."""
    token = uuid.uuid4().hex
    return "{}_{}".format(kind, token[:10])
def _build_txt2img(positive: str, negative: str, settings: dict,
                   width: int, height: int, seed: int) -> dict:
    """
    Assemble the node graph for an SDXL text-to-image run.

    Both prompt encoders take CLIP from a CLIPSetLastLayer node (id "10")
    whose stop layer is the negated `clip_skip` setting, so one graph
    covers skip 1 (-1) and skip 2 (-2). The SaveImage node is id "9".
    """
    def encode(text: str) -> dict:
        # Prompt text is always encoded with the clip-skip-adjusted CLIP.
        return {"class_type": "CLIPTextEncode",
                "inputs": {"text": text, "clip": ["10", 0]}}

    sampler_inputs = {
        "seed": _seed_value(seed),
        "steps": settings["steps"],
        "cfg": settings["cfg"],
        "sampler_name": settings["sampler"],
        "scheduler": settings["scheduler"],
        "denoise": 1.0,
        "model": ["4", 0],
        "positive": ["6", 0],
        "negative": ["7", 0],
        "latent_image": ["5", 0],
    }

    graph = {"3": {"class_type": "KSampler", "inputs": sampler_inputs}}
    graph["4"] = {
        "class_type": "CheckpointLoaderSimple",
        "inputs": {"ckpt_name": settings["ckpt"]},
    }
    graph["5"] = {
        "class_type": "EmptyLatentImage",
        "inputs": {"width": width, "height": height, "batch_size": 1},
    }
    graph["6"] = encode(positive)
    graph["7"] = encode(negative)
    graph["8"] = {
        "class_type": "VAEDecode",
        "inputs": {"samples": ["3", 0], "vae": ["4", 2]},
    }
    graph["9"] = {
        "class_type": "SaveImage",
        "inputs": {"filename_prefix": _job_prefix("smartgen"), "images": ["8", 0]},
    }
    graph["10"] = {
        "class_type": "CLIPSetLastLayer",
        "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]},
    }
    return graph
def _build_inpaint(positive: str, negative: str, settings: dict,
                   image_filename: str, mask_text: str,
                   denoise: float, seed: int) -> dict:
    """
    Assemble the node graph for SDXL inpainting with a text-derived mask.

    The mask comes from comfyui_segment_anything (GroundingDINO + SAM-HQ,
    installed by the Dockerfile), prompted with `mask_text` — a noun
    phrase such as "the dog's collar". SetLatentNoiseMask then limits the
    KSampler repaint to that region; everything outside the mask stays
    pixel-perfect.

    The raw SAM mask goes through GrowMask (expand=12, tapered corners)
    before reaching the sampler. A pixel-binary mask edge makes the
    sampler repaint right up to a hard boundary with no surrounding
    context to blend with, so the result looks pasted-on with visible
    seams; the grown, tapered mask gives a soft transition instead.

    The first inpaint downloads ~3 GB of SAM / GroundingDINO weights into
    /opt/comfyui/models/{sams,grounding-dino}/; later runs reuse them.
    """
    def encode(text: str) -> dict:
        # Prompt text is always encoded with the clip-skip-adjusted CLIP.
        return {"class_type": "CLIPTextEncode",
                "inputs": {"text": text, "clip": ["10", 0]}}

    sampler_inputs = {
        "seed": _seed_value(seed),
        "steps": settings["steps"],
        "cfg": settings["cfg"],
        "sampler_name": settings["sampler"],
        "scheduler": settings["scheduler"],
        "denoise": denoise,
        "model": ["4", 0],
        "positive": ["6", 0],
        "negative": ["7", 0],
        # Masked latent from node 13, not the plain VAE encode.
        "latent_image": ["13", 0],
    }

    graph = {"3": {"class_type": "KSampler", "inputs": sampler_inputs}}
    graph["4"] = {
        "class_type": "CheckpointLoaderSimple",
        "inputs": {"ckpt_name": settings["ckpt"]},
    }
    graph["6"] = encode(positive)
    graph["7"] = encode(negative)
    graph["8"] = {
        "class_type": "VAEDecode",
        "inputs": {"samples": ["3", 0], "vae": ["4", 2]},
    }
    graph["9"] = {
        "class_type": "SaveImage",
        "inputs": {"filename_prefix": _job_prefix("smartinpaint"), "images": ["8", 0]},
    }
    graph["10"] = {
        "class_type": "CLIPSetLastLayer",
        "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]},
    }
    graph["11"] = {
        "class_type": "VAEEncode",
        "inputs": {"pixels": ["12", 0], "vae": ["4", 2]},
    }
    graph["12"] = {"class_type": "LoadImage", "inputs": {"image": image_filename}}
    graph["13"] = {
        "class_type": "SetLatentNoiseMask",
        "inputs": {"samples": ["11", 0], "mask": ["17", 0]},
    }
    graph["14"] = {
        "class_type": "SAMModelLoader (segment anything)",
        "inputs": {"model_name": "sam_hq_vit_h (2.57GB)"},
    }
    graph["15"] = {
        "class_type": "GroundingDinoModelLoader (segment anything)",
        "inputs": {"model_name": "GroundingDINO_SwinT_OGC (694MB)"},
    }
    graph["16"] = {
        "class_type": "GroundingDinoSAMSegment (segment anything)",
        "inputs": {
            "sam_model": ["14", 0],
            "grounding_dino_model": ["15", 0],
            "image": ["12", 0],
            "prompt": mask_text,
            "threshold": 0.3,
        },
    }
    graph["17"] = {
        "class_type": "GrowMask",
        "inputs": {
            # Output slot 1 of the segment node is used as the mask.
            "mask": ["16", 1],
            "expand": 12,
            "tapered_corners": True,
        },
    }
    return graph
def _build_img2img(positive: str, negative: str, settings: dict,
                   image_filename: str, denoise: float, seed: int) -> dict:
    """
    Assemble the node graph for an SDXL image-to-image run.

    `image_filename` must already be uploaded to ComfyUI's /input/. It is
    loaded, VAE-encoded to a latent and sampled at the requested
    `denoise`. The graph contains no resize node, so the output keeps the
    source image's resolution.
    """
    def encode(text: str) -> dict:
        # Prompt text is always encoded with the clip-skip-adjusted CLIP.
        return {"class_type": "CLIPTextEncode",
                "inputs": {"text": text, "clip": ["10", 0]}}

    sampler_inputs = {
        "seed": _seed_value(seed),
        "steps": settings["steps"],
        "cfg": settings["cfg"],
        "sampler_name": settings["sampler"],
        "scheduler": settings["scheduler"],
        "denoise": denoise,
        "model": ["4", 0],
        "positive": ["6", 0],
        "negative": ["7", 0],
        "latent_image": ["11", 0],
    }

    graph = {"3": {"class_type": "KSampler", "inputs": sampler_inputs}}
    graph["4"] = {
        "class_type": "CheckpointLoaderSimple",
        "inputs": {"ckpt_name": settings["ckpt"]},
    }
    graph["6"] = encode(positive)
    graph["7"] = encode(negative)
    graph["8"] = {
        "class_type": "VAEDecode",
        "inputs": {"samples": ["3", 0], "vae": ["4", 2]},
    }
    graph["9"] = {
        "class_type": "SaveImage",
        "inputs": {"filename_prefix": _job_prefix("smartedit"), "images": ["8", 0]},
    }
    graph["10"] = {
        "class_type": "CLIPSetLastLayer",
        "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]},
    }
    graph["11"] = {
        "class_type": "VAEEncode",
        "inputs": {"pixels": ["12", 0], "vae": ["4", 2]},
    }
    graph["12"] = {"class_type": "LoadImage", "inputs": {"image": image_filename}}
    return graph
def _file_dict_is_image(f: dict) -> bool:
    """Return True when a file dict looks like an image: its `type`
    contains "image", or its `name` / `filename` ends in .png, .jpg,
    .jpeg or .webp (all compared case-insensitively)."""
    declared = f.get("type") or ""
    label = f.get("name") or f.get("filename") or ""
    declared, label = declared.lower(), label.lower()
    if "image" in declared:
        return True
    return label.endswith((".png", ".jpg", ".jpeg", ".webp"))
# Pulls the file id out of URLs such as /api/v1/files/<id>/content or
# /files/<id>; the id is 8+ hex digits / dashes.
_FILE_URL_ID_RE = re.compile(r"/(?:api/v1/)?files/([0-9a-fA-F-]{8,})(?:/content)?")


def _read_path_bytes(path) -> Optional[bytes]:
    """Return the full contents of `path`, or None if it can't be read.

    Blocking helper — call it through asyncio.to_thread from coroutines.
    TypeError / ValueError cover a non-path value or an embedded NUL.
    """
    try:
        with open(path, "rb") as fh:
            return fh.read()
    except (OSError, TypeError, ValueError):
        return None


async def _read_file_dict(f: dict) -> Optional[bytes]:
    """
    Try to read raw bytes for one file dict. Tries in order:

      1. Local filesystem path keys `path`, `filepath`, `file_path`
         (covers user uploads that carry a path).
      2. Open WebUI's Files.get_file_by_id with f["id"] (covers files
         the user uploaded via the file API).
      3. The same lookup with the id parsed out of f["url"] (covers
         assistant-emitted files where the message attachment is just
         {"type":"image","url":"/api/v1/files/<uuid>/content"} —
         no id field, no path field, but the URL has the id). Skipped
         when it is the same id as in step 2.

    Async because Open WebUI 0.9.0 made every model-class accessor
    a coroutine (Users / Chats / Files / etc.). Calling the sync
    way returns a coroutine object instead of the model — silently
    breaks downstream attribute access. File reads are pushed to a worker
    thread so a large image does not stall the event loop.

    NOTE(review): path values are taken from the file dict as-is and
    opened without restriction — confirm these dicts cannot be shaped by
    an untrusted client.

    :param f: File dict from a message's `files` list or the __files__ param.
    :return: File contents, or None when no source could be read.
    """
    for path_key in ("path", "filepath", "file_path"):
        path = f.get(path_key)
        if path:
            data = await asyncio.to_thread(_read_path_bytes, path)
            if data is not None:
                return data

    candidate_ids = []
    if f.get("id"):
        candidate_ids.append(f["id"])
    url = f.get("url")
    if isinstance(url, str):
        m = _FILE_URL_ID_RE.search(url)
        # Don't hit the database twice for the same id.
        if m and m.group(1) not in candidate_ids:
            candidate_ids.append(m.group(1))

    # Nothing to look up, or no Open WebUI models to look it up with.
    if not candidate_ids or not _OPENWEBUI_RUNTIME:
        return None

    for fid in candidate_ids:
        try:
            file_model = await Files.get_file_by_id(fid)
            if file_model is None:
                continue
            path = getattr(file_model, "path", None)
            if not path:
                # Fall back to a path stored in the model's `meta`, which
                # may be a dict or an attribute-style object.
                meta = getattr(file_model, "meta", None) or {}
                if isinstance(meta, dict):
                    path = meta.get("path")
                else:
                    path = getattr(meta, "path", None)
        except Exception:
            # Best-effort lookup: on any failure try the next id.
            continue
        if path:
            data = await asyncio.to_thread(_read_path_bytes, path)
            if data is not None:
                return data
    return None
# Base used for relative file URLs in the last-resort fetch.
_OWUI_LOCAL_BASE = "http://localhost:8080"


async def _first_readable_image(file_dicts) -> Optional[bytes]:
    """Return bytes for the first image-looking dict in `file_dicts` that
    _read_file_dict can read, or None (also when `file_dicts` isn't a
    list / tuple)."""
    if not isinstance(file_dicts, (list, tuple)):
        return None
    for f in file_dicts:
        if not isinstance(f, dict) or not _file_dict_is_image(f):
            continue
        data = await _read_file_dict(f)
        if data is not None:
            return data
    return None


async def _extract_attached_image(
    files: Optional[list],
    messages: Optional[list],
    metadata: Optional[dict],
    session: aiohttp.ClientSession,
) -> Optional[bytes]:
    """
    Find the most recent image in the chat — including images previously
    emitted by this tool itself. Search order:

      1. Inline base64 data URIs in `image_url` content blocks of
         messages, newest message first (vision-model uploads,
         paste-from-clipboard).
      2. Files attached to messages in the conversation, scanned in
         REVERSE so the newest image wins. This covers two cases:
           a. Files the user just attached (current user message).
           b. Files the assistant emitted via prior `generate_image` /
              `edit_image` calls (attached to assistant messages by the
              `files` event in _push_image_to_chat).
      3. The __files__ tool param (some Open WebUI versions pass user
         uploads here instead of on the message).
      4. The chat record loaded via Chats.get_chat_by_id(chat_id), whose
         messages are scanned newest first — only when the Open WebUI
         runtime is importable and metadata carries a chat_id.
      5. Best-effort unauthenticated URL fetch on any image file dict
         (likely fails on auth-protected endpoints — last resort).

    :param files: The __files__ tool param, or None.
    :param messages: The __messages__ tool param, or None.
    :param metadata: The __metadata__ tool param; only `chat_id` is read.
    :param session: HTTP session used for step 5 only.
    :return: Raw image bytes, or None if every source came up empty.
    """
    history = [m for m in reversed(messages or []) if isinstance(m, dict)]

    # 1. Inline data URIs on recent messages.
    for msg in history:
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        for part in content:
            if not isinstance(part, dict) or part.get("type") != "image_url":
                continue
            image_url = part.get("image_url")
            # Tolerate both {"url": "..."} and a bare URL string.
            url = image_url.get("url") if isinstance(image_url, dict) else image_url
            if isinstance(url, str) and url.startswith("data:image"):
                try:
                    return base64.b64decode(url.split(",", 1)[1])
                except (IndexError, ValueError):
                    # No comma or invalid base64 — keep looking.
                    pass

    # 2. Files on messages, newest first.
    for msg in history:
        data = await _first_readable_image(msg.get("files"))
        if data is not None:
            return data

    # 3. __files__ param (current user upload, sometimes only here).
    data = await _first_readable_image(files)
    if data is not None:
        return data

    # 4. Pull the chat from the database directly. Open WebUI persists
    # `files` on every message via the upsert in socket/main.py — so even
    # if __messages__ doesn't hydrate the assistant-emitted attachments,
    # the chat record does. This is the strongest fallback.
    chat_id = metadata.get("chat_id") if isinstance(metadata, dict) else None
    if chat_id and _OPENWEBUI_RUNTIME:
        try:
            chat = await Chats.get_chat_by_id(chat_id)
            chat_data = getattr(chat, "chat", None) if chat else None
            stored = chat_data.get("messages", []) if isinstance(chat_data, dict) else []
            for msg in reversed(stored):
                if not isinstance(msg, dict):
                    continue
                data = await _first_readable_image(msg.get("files"))
                if data is not None:
                    return data
        except Exception:
            # Best effort: a DB or schema problem must not block step 5.
            pass

    # 5. Last-resort URL fetch (no auth — only works for public endpoints).
    # NOTE(review): absolute http(s) URLs from the file dict are fetched
    # as-is — confirm these dicts cannot be shaped by an untrusted client.
    for source in [files or []] + [m.get("files") or [] for m in history]:
        if not isinstance(source, (list, tuple)):
            continue
        for f in source:
            if not isinstance(f, dict) or not _file_dict_is_image(f):
                continue
            url = f.get("url")
            if not isinstance(url, str) or not url:
                continue
            full = url if url.startswith("http") else f"{_OWUI_LOCAL_BASE}{url}"
            try:
                async with session.get(full) as resp:
                    if resp.status == 200:
                        return await resp.read()
            except (aiohttp.ClientError, asyncio.TimeoutError):
                pass
    return None
async def _upload_to_comfyui(
    session: aiohttp.ClientSession, base: str, raw: bytes
) -> Optional[str]:
    """Upload `raw` to ComfyUI's /upload/image under a fresh
    `smartedit_<12 hex>.png` name (overwrite enabled).

    Returns the `name` field of the JSON reply, falling back to the
    generated name when the reply has none, or None on a non-200 status.
    """
    upload_name = "smartedit_{}.png".format(uuid.uuid4().hex[:12])

    payload = aiohttp.FormData()
    payload.add_field("image", raw, filename=upload_name, content_type="image/png")
    payload.add_field("overwrite", "true")

    endpoint = f"{base}/upload/image"
    async with session.post(endpoint, data=payload) as resp:
        if resp.status == 200:
            reply = await resp.json()
            return reply.get("name", upload_name)
        return None
async def _push_image_to_chat(
    raw: bytes,
    filename_prefix: str,
    request,
    user_dict: Optional[dict],
    metadata: Optional[dict],
    event_emitter: Optional[Callable[[dict], Awaitable[None]]],
) -> bool:
    """
    Surface a generated image in the chat using Open WebUI's canonical
    pattern: upload the bytes via the internal file store, then emit a
    `files` event referencing the served URL. This is the same path Open
    WebUI's own image-generation code uses (utils/middleware.py ~1325).

    :param raw: PNG bytes to upload.
    :param filename_prefix: Prefix of the stored name
        (`<prefix>_<8 hex>.png`).
    :param request: The __request__ object; its app resolves the file URL.
    :param user_dict: The __user__ dict; its `id` is resolved to a user.
    :param metadata: The __metadata__ dict; `chat_id` / `message_id` are
        forwarded to the upload handler.
    :param event_emitter: The __event_emitter__ callable.
    :return: True if the image was uploaded and emitted via the files
        event. False if anything is missing or fails — the caller should
        fall back to a data-URI markdown message in that case. Never
        raises; failures are logged with their traceback.
    """
    import logging  # local import: the module's import block is left untouched

    # Cheap argument checks first; the runtime flag is consulted last.
    if not (request and user_dict and event_emitter and _OPENWEBUI_RUNTIME):
        return False
    try:
        user = await Users.get_user_by_id(user_dict.get("id"))
        if not user:
            return False
        upload = UploadFile(
            file=io.BytesIO(raw),
            filename=f"{filename_prefix}_{uuid.uuid4().hex[:8]}.png",
            headers={"content-type": "image/png"},
        )
        meta = metadata or {}
        result = upload_file_handler(
            request=request,
            file=upload,
            metadata={
                "chat_id": meta.get("chat_id"),
                "message_id": meta.get("message_id"),
            },
            process=False,
            user=user,
        )
        # upload_file_handler may be sync or async depending on the Open
        # WebUI version — handle either. isawaitable also covers futures
        # and other awaitables, not just coroutine objects.
        if inspect.isawaitable(result):
            file_item = await result
        else:
            file_item = result
        url = request.app.url_path_for(
            "get_file_content_by_id", id=file_item.id
        )
        await event_emitter({
            "type": "files",
            "data": {"files": [{"type": "image", "url": url}]},
        })
        return True
    except Exception:
        # Any failure (signature drift, missing route, etc.) falls back
        # to the data-URI path in the caller. Log it: a silent False here
        # makes the fallback look like a rendering problem.
        logging.getLogger(__name__).exception(
            "Uploading the generated image to Open WebUI failed; "
            "falling back to data-URI markdown"
        )
        return False
async def _submit_and_fetch(
    session: aiohttp.ClientSession,
    base: str,
    workflow: dict,
    timeout_seconds: int,
    emit: Callable[[str, bool], Awaitable[None]],
    settings: dict,
) -> tuple[Optional[bytes], Optional[str]]:
    """Submit a workflow, poll history, fetch the first output image.

    :param session: HTTP session used for all ComfyUI calls.
    :param base: ComfyUI base URL without a trailing slash.
    :param workflow: Node graph posted to /prompt.
    :param timeout_seconds: Maximum time to keep polling /history.
    :param emit: Status callback, called once after the prompt is accepted.
    :param settings: Style settings; only used for the status text.
    :return: (image_bytes, None) on success, (None, error_message) when
        the prompt is rejected, fails, times out or the image can't be
        downloaded.
    """
    client_id = str(uuid.uuid4())
    async with session.post(
        f"{base}/prompt", json={"prompt": workflow, "client_id": client_id}
    ) as resp:
        if resp.status != 200:
            return None, f"ComfyUI rejected the prompt: {resp.status} {await resp.text()}"
        prompt_id = (await resp.json()).get("prompt_id")
    if not prompt_id:
        return None, "ComfyUI didn't return a prompt_id."
    await emit(
        f"Sampling — {settings['sampler']}/{settings['scheduler']}, "
        f"CFG {settings['cfg']}, {settings['steps']} steps", False
    )
    # The SaveImage node in every workflow this tool builds is id "9".
    # We prefer it explicitly because intermediate nodes (e.g. the
    # GroundingDinoSAMSegment IMAGE output in the inpaint workflow) can
    # land in the outputs dict too, and dict iteration order is not
    # stable across runs — without preferring "9" we sometimes returned
    # an overlay or masked-only image that rendered mostly black.
    SAVE_NODE_ID = "9"
    # Monotonic clock: a wall-clock adjustment must not stretch or cut
    # the timeout.
    deadline = time.monotonic() + timeout_seconds
    output_images: list = []
    while time.monotonic() < deadline:
        await asyncio.sleep(1.5)
        async with session.get(f"{base}/history/{prompt_id}") as resp:
            if resp.status != 200:
                continue
            history = await resp.json()
        entry = history.get(prompt_id) if isinstance(history, dict) else None
        if not isinstance(entry, dict):
            continue
        outputs = entry.get("outputs") or {}
        # Prefer the canonical SaveImage output …
        output_images.extend((outputs.get(SAVE_NODE_ID) or {}).get("images") or [])
        # … only fall back to other nodes if SaveImage didn't fire
        # (workflow drift, manual override, etc.)
        if not output_images:
            for node_out in outputs.values():
                if isinstance(node_out, dict):
                    output_images.extend(node_out.get("images") or [])
        if output_images:
            break
        # The run is recorded but produced no image. If ComfyUI marks it
        # failed or completed, stop instead of polling until the timeout.
        # NOTE(review): relies on the history entry's
        # status = {status_str, completed, messages} layout — confirm
        # against the deployed ComfyUI version.
        status = entry.get("status")
        if isinstance(status, dict) and (
            status.get("status_str") == "error" or status.get("completed")
        ):
            detail = ""
            for item in status.get("messages") or []:
                if (
                    isinstance(item, (list, tuple))
                    and len(item) == 2
                    and item[0] == "execution_error"
                    and isinstance(item[1], dict)
                ):
                    detail = str(item[1].get("exception_message") or "").strip()
            if detail:
                return None, f"ComfyUI finished without producing an image: {detail}"
            return None, "ComfyUI finished without producing an image."
    if not output_images:
        return None, f"Timed out after {timeout_seconds}s waiting for image."
    img = output_images[0]
    params = {
        "filename": img["filename"],
        "subfolder": img.get("subfolder", ""),
        "type": img.get("type", "output"),
    }
    async with session.get(f"{base}/view", params=params) as resp:
        if resp.status != 200:
            return None, f"Failed to fetch image: {resp.status}"
        return await resp.read(), None
class Tools:
class Valves(BaseModel):
    # Settings for the tool. Both fields are read through self.valves by
    # the tool methods.

    # Base URL for every ComfyUI HTTP call; callers strip a trailing "/"
    # before appending endpoint paths.
    COMFYUI_BASE_URL: str = Field(
        default="http://comfyui:8188",
        description="ComfyUI server URL reachable from the open-webui container.",
    )
    # Upper bound, in seconds, on how long one submitted workflow is
    # polled for before the tool gives up with a timeout error.
    TIMEOUT_SECONDS: int = Field(
        default=600,
        description=(
            "Maximum wait for a single generation to complete. "
            "Default 10 minutes — long enough to absorb a first-time "
            "inpaint where SAM-HQ + GroundingDINO + BERT auto-download "
            "(~3 GB). Steady-state runs finish in well under a minute; "
            "if your KSampler routinely takes longer than that, lower "
            "the per-style steps in STYLES."
        ),
    )
def __init__(self):
    """Create the tool with a Valves instance holding the field defaults."""
    self.valves = self.Valves()
async def generate_image(
    self,
    prompt: str,
    style: Optional[StyleName] = None,
    negative_prompt: Optional[str] = None,
    width: int = 1024,
    height: int = 1024,
    seed: int = 0,
    __request__=None,
    __user__: Optional[dict] = None,
    __metadata__: Optional[dict] = None,
    __event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None,
) -> str:
    """
    Create a NEW image from scratch and show it to the user. Use this
    whenever the user asks you to draw, generate, create, make, paint,
    render, or imagine any visual content — photographs, portraits,
    characters, scenes, illustrations, anime, drawings — and they have
    NOT attached an existing image. If they did attach an image and
    want it modified, use edit_image instead.
    Pick `style` to match what the user wants:
    - "photo" — photorealistic photographs, portraits, cinematic shots.
    - "juggernaut" — alternate photoreal style (sharper, more saturated).
    - "pony" — anime / illustration / cartoon (Pony Diffusion).
    - "general" — fallback for anything that doesn't fit the others.
    - "furry-nai" — anthropomorphic characters (NAI-trained mix).
    - "furry-noob" — anthropomorphic characters (NoobAI base).
    - "furry-il" — anthropomorphic characters (Illustrious base, default
    for any "furry" / "anthro" request unless specified otherwise).
    Each style auto-prepends the right quality tags and picks the right
    sampler / CFG / steps / CLIP skip. Do NOT add tags like
    "masterpiece" or "score_9" to `prompt` yourself; the tool handles
    that.
    :param prompt: Plain description of the image (subject, scene,
    style notes, lighting, etc.). No quality tags.
    :param style: One of the values above. Omit to auto-detect.
    :param negative_prompt: Extra terms to exclude. Usually unneeded.
    :param width: Pixels (default 1024 — SDXL native). For portraits
    use 832 with height 1216; for landscapes 1216 with height 832.
    :param height: Pixels (default 1024).
    :param seed: 0 to randomize, otherwise a specific seed for repeats.
    :return: Short confirmation text (the image itself is pushed to the
    chat through the event emitter), or an error message.
    """
    chosen = style or _route_style(prompt)
    settings = STYLES.get(chosen)
    if not settings:
        return f"Unknown style '{chosen}'. Available: {', '.join(STYLES.keys())}"

    async def emit(msg: str, done: bool = False):
        # Status line in the chat UI; a no-op without an emitter.
        if __event_emitter__:
            await __event_emitter__({
                "type": "status",
                "data": {"description": msg, "done": done},
            })

    await emit(f"Routing to {chosen} ({settings['ckpt']})")
    positive = f"{settings['prefix']}{prompt}"
    negative = settings["negative"]
    if negative_prompt:
        negative = f"{negative}, {negative_prompt}"
    workflow = _build_txt2img(positive, negative, settings, width, height, seed)
    base = self.valves.COMFYUI_BASE_URL.rstrip("/")
    try:
        async with aiohttp.ClientSession() as session:
            raw, err = await _submit_and_fetch(
                session, base, workflow, self.valves.TIMEOUT_SECONDS, emit, settings,
            )
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        # Connection refused, DNS failure, dropped socket, … — report it
        # as a tool result instead of letting the exception escape.
        raw, err = None, f"Could not reach ComfyUI at {base}: {exc}"
    if err or raw is None:
        # Close the status line; otherwise the last status emitted is
        # left with done=False.
        await emit("Image generation failed", done=True)
        return err or "ComfyUI returned no image data."
    # Surface the image in the chat. Preferred path uploads to Open
    # WebUI's file store and emits a `files` event (matches the built-
    # in image-gen flow). Fallback inlines a data-URI markdown via a
    # `message` event for environments where the file API isn't
    # reachable from the tool process.
    pushed = await _push_image_to_chat(
        raw, "smartgen", __request__, __user__, __metadata__, __event_emitter__,
    )
    if not pushed and __event_emitter__:
        b64 = base64.b64encode(raw).decode("ascii")
        await __event_emitter__({
            "type": "message",
            "data": {"content": f"![{chosen}](data:image/png;base64,{b64})"},
        })
    await emit(f"Done — {chosen}", done=True)
    return (
        f"Image generated and shown to the user above (style: {chosen}, "
        f"checkpoint: {settings['ckpt']}). Do NOT describe the image, "
        f"do NOT repeat any base64 or markdown — the user can see it. "
        f"You may briefly note your style choice and offer one or two "
        f"iteration ideas (different style, tighter framing, etc)."
    )
async def edit_image(
self,
prompt: str,
style: Optional[StyleName] = None,
mask_text: Optional[str] = None,
denoise: Optional[float] = None,
negative_prompt: Optional[str] = None,
seed: int = 0,
__request__=None,
__user__: Optional[dict] = None,
__metadata__: Optional[dict] = None,
__files__: Optional[list] = None,
__messages__: Optional[list] = None,
__event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None,
) -> str:
"""
Edit, modify, transform, or restyle an image the user has ATTACHED
to the chat. Use whenever the user uploads an image and asks to
change it. If no image is attached, use generate_image instead.
TWO MODES — choose based on whether the change is local or global:
- LOCAL change ("change the ball to a basketball", "make the dog
wear a hat", "remove the bird") → set `mask_text` to a brief
noun phrase describing the region ("the ball", "the dog", "the
bird"). The tool uses GroundingDINO+SAM to find that region
automatically and only that area is repainted; the rest of the
image stays pixel-perfect.
- GLOBAL change ("make this a sunset", "turn this into anime",
"restyle this as oil painting") → leave `mask_text` unset. The
whole image is reimagined via img2img.
Always prefer LOCAL mode when the user names a specific object,
person, or region. GLOBAL mode is for whole-image style/lighting
transformations.
Denoise tuning:
- LOCAL (mask_text set): default 1.0 — full repaint within mask.
Drop to 0.6-0.8 for subtle local edits that should retain some
original structure.
- GLOBAL (no mask_text): default 0.7 — moderate edit. Use 0.3-0.5
for subtle restyling, 0.85-1.0 for radical reimagining.
Pick `style` for the DESIRED OUTPUT, not the input image.
Style resolution order: inherited from the most recent prior
generate_image / edit_image call in this conversation (DOMINANT)
→ explicit `style` arg → keyword detection on `prompt`.
Inheritance dominates because vision LLMs misclassify subjects
in the rendered output (e.g. picking 'juggernaut' on a
'furry-il' source). For follow-up edits on an image you
generated earlier, omit `style` entirely — the tool reuses the
established style automatically. The user can start a new chat
if they want a different style.
:param prompt: What the changed area should look like.
Tool auto-prepends quality tags — don't include those.
:param style: One of the StyleName values. Omit to auto-inherit
from the previous tool call (recommended for edits on
images you generated earlier in this chat).
:param mask_text: Noun phrase describing the region to edit. Set
for LOCAL changes; omit for GLOBAL.
:param denoise: 0.0 = no change, 1.0 = ignore source. Defaults to
1.0 with mask_text, 0.7 without.
:param negative_prompt: Extra terms to exclude. Usually unneeded.
:param seed: 0 to randomize, otherwise specific.
:return: Markdown image of the result, or an error if no image is attached.
"""
# Resolve style — inheritance DOMINATES for edits. Vision LLMs
# misclassify subject types (observed in the wild: juggernaut
# picked for a furry-il source because the model thought the
# rendered character looked "photoreal-ish"). When there's a
# prior tool call in this chat, use the same style; the user's
# workaround for genuine style changes is a fresh chat.
chosen = _inherited_style(__messages__) or style or _route_style(prompt)
settings = STYLES.get(chosen)
if not settings:
return f"Unknown style '{chosen}'. Available: {', '.join(STYLES.keys())}"
# Denoise default depends on mode: 1.0 (full repaint within mask)
# for inpainting, 0.7 for img2img.
if denoise is None:
denoise = 1.0 if mask_text else 0.7
denoise = max(0.0, min(1.0, denoise))
async def emit(msg: str, done: bool = False):
    """Send a status event through ``__event_emitter__`` when one is set.

    :param msg: Text placed in the event's ``description`` field.
    :param done: Value placed in the event's ``done`` field.
    """
    # Without an emitter there is nowhere to send the status; do nothing.
    if not __event_emitter__:
        return
    status_event = {
        "type": "status",
        "data": {"description": msg, "done": done},
    }
    await __event_emitter__(status_event)
base = self.valves.COMFYUI_BASE_URL.rstrip("/")
async with aiohttp.ClientSession() as session:
await emit("Looking for attached image…")
raw_in = await _extract_attached_image(
__files__, __messages__, __metadata__, session,
)
if raw_in is None:
msgs_with_files = sum(
1 for m in (__messages__ or [])
if isinstance(m, dict) and m.get("files")
)
chat_id_present = bool((__metadata__ or {}).get("chat_id"))
return (
"No image found in the chat. Diagnostics: "
f"__files__={len(__files__ or [])}, "
f"__messages__={len(__messages__ or [])} "
f"(of which {msgs_with_files} had a files field), "
f"chat_id_present={chat_id_present}, "
f"openwebui_runtime={_OPENWEBUI_RUNTIME}. "
"Ask the user to attach the image they want edited "
"(paperclip / drag-drop), or call generate_image instead."
)
# Diagnostic emit so a misrouted source ("wrong image
# returned") shows up in the status track instead of being
# invisible. SHA-1 is fast and the first 8 hex chars are
# plenty to compare against the prior generation's hash if
# cross-talk is suspected.
import hashlib # local import — keeps the module import surface clean
src_hash = hashlib.sha1(raw_in).hexdigest()[:8]
await emit(f"Uploading source to ComfyUI… (sha1={src_hash}, {len(raw_in)} bytes)")
uploaded_name = await _upload_to_comfyui(session, base, raw_in)
if not uploaded_name:
return "Failed to upload source image to ComfyUI."
mode = "inpaint" if mask_text else "img2img"
await emit(
f"Routing to {chosen} ({settings['ckpt']}), {mode}, denoise {denoise:.2f}"
+ (f", mask='{mask_text}'" if mask_text else "")
)
positive = f"{settings['prefix']}{prompt}"
negative = settings["negative"]
if negative_prompt:
negative = f"{negative}, {negative_prompt}"
if mask_text:
workflow = _build_inpaint(
positive=positive,
negative=negative,
settings=settings,
image_filename=uploaded_name,
mask_text=mask_text,
denoise=denoise,
seed=seed,
)
else:
workflow = _build_img2img(
positive=positive,
negative=negative,
settings=settings,
image_filename=uploaded_name,
denoise=denoise,
seed=seed,
)
raw_out, err = await _submit_and_fetch(
session, base, workflow, self.valves.TIMEOUT_SECONDS, emit, settings,
)
if err:
return err
pushed = await _push_image_to_chat(
raw_out, "smartedit", __request__, __user__, __metadata__, __event_emitter__,
)
if not pushed and __event_emitter__:
b64 = base64.b64encode(raw_out).decode("ascii")
await __event_emitter__({
"type": "message",
"data": {"content": f"![edit:{chosen}](data:image/png;base64,{b64})"},
})
await emit(f"Done — {chosen} (denoise {denoise:.2f})", done=True)
return (
f"Edited image shown to the user above (style: {chosen}, "
f"checkpoint: {settings['ckpt']}, denoise: {denoise:.2f}). Do NOT "
f"describe the image, do NOT repeat any base64 or markdown — the "
f"user can see it. You may briefly note your choice and offer "
f"iterations (different denoise, alternate style, etc)."
)