Open WebUI 0.9.0 made every model-class accessor (Users.get_user_by_id,
Chats.get_chat_by_id, Files.get_file_by_id, …) a coroutine. Both tools
were still calling them synchronously, so the calls returned coroutines
instead of model objects; the first downstream attribute access threw,
the bare `except Exception: return False` swallowed it, and uploads
silently fell through to the data-URI fallback. The data-URI markdown
rendered during streaming but didn't survive post-stream commit, which
looked like "image flashes in, then disappears."
Add await to the six call sites; promote `_read_file_dict` to async
since it now contains an await; restore `_push_image_to_chat` to the
canonical `files` event so the file-attachment chrome (thumbnail +
download) comes back.
This supersedes commit d034700, which mis-diagnosed the symptom as a
virtualization regression and switched to a `message`-event markdown
workaround. The workaround didn't help (same flash-and-vanish) because
the upload pre-check still failed for the same async-migration reason
and the data-URI fallback path still ran.
smart_image_gen.py 0.7.9 -> 0.7.10
smart_image_pipe.py 0.1.1 -> 0.1.2
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1025 lines
44 KiB
Python
1025 lines
44 KiB
Python
"""
|
||
title: Smart Image Generator & Editor (ComfyUI)
|
||
author: ai-stack
|
||
version: 0.7.10
|
||
description: Generate or edit images via ComfyUI with automatic SDXL
|
||
checkpoint routing. Two methods — generate_image (txt2img) and
|
||
edit_image (img2img on the user's most recently attached image). The
|
||
LLM picks (or auto-detects) the right model — photoreal, Pony
|
||
score-tag, NoobAI/Illustrious furry, etc. — and each style ships
|
||
with the creator-recommended sampler, scheduler, CFG, steps, CLIP
|
||
skip, prompt-prefix dialect, and negatives. The image is uploaded
|
||
to Open WebUI's file store and surfaced via a `files` event (the
|
||
canonical pattern used by Open WebUI's own image-gen path); the
|
||
function return is a short confirmation so the LLM doesn't try to
|
||
describe or re-emit the image.
|
||
required_open_webui_version: 0.5.0
|
||
"""
|
||
|
||
import asyncio
import base64
import inspect
import io
import json
import logging
import re
import time
import uuid
from typing import Awaitable, Callable, Literal, Optional

import aiohttp
from pydantic import BaseModel, Field
|
||
|
||
# Open WebUI's runtime — only available when the tool is loaded inside the
# Open WebUI process. Guarded so the module still imports for standalone
# linting/testing; if the imports fail, _push_image_to_chat returns False
# and its caller falls back to emitting a markdown data-URI message.
|
||
try:
|
||
from fastapi import UploadFile
|
||
from open_webui.models.chats import Chats
|
||
from open_webui.models.files import Files
|
||
from open_webui.models.users import Users
|
||
from open_webui.routers.files import upload_file_handler
|
||
|
||
_OPENWEBUI_RUNTIME = True
|
||
except ImportError:
|
||
_OPENWEBUI_RUNTIME = False
|
||
|
||
# Closed set of style identifiers accepted by the tools' `style` argument.
StyleName = Literal[
    "photo",
    "juggernaut",
    "pony",
    "general",
    "furry-nai",
    "furry-noob",
    "furry-il",
]
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Per-style settings — sampler/scheduler/cfg/steps/clip_skip/prefix/negatives
|
||
# come from each model's creator page on Civitai. Three prefix dialects in
|
||
# play: photoreal (no prefix, natural language), Pony score chain (REQUIRED
|
||
# for any Pony-derived checkpoint), and Booru quality tags (NoobAI /
|
||
# Illustrious lineage). Never cross-contaminate.
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Per-style generation settings keyed by style name. Every entry carries the
# same eight keys: ckpt, sampler, scheduler, cfg, steps, clip_skip, prefix
# (prepended verbatim to the user's prompt, so non-empty prefixes end in ", ")
# and negative (callers append extra terms with ", ", so it must NOT end in a
# separator).
STYLES = {
    "photo": {
        "ckpt": "CyberRealisticXLPlay_V8.0_FP16.safetensors",
        "sampler": "dpmpp_2m_sde",
        "scheduler": "karras",
        "cfg": 4.0,
        "steps": 28,
        "clip_skip": 1,
        "prefix": "",  # natural language only — no quality tags
        "negative": (
            "cartoon, drawing, illustration, anime, manga, painting, sketch, "
            "render, 3d, cgi, watercolor, plastic skin, doll-like, oversaturated, "
            "lowres, blurry, jpeg artifacts, noisy, grainy, low quality, worst quality, "
            "bad anatomy, deformed, mutated, extra limbs, extra fingers, missing fingers, "
            "fused fingers, malformed hands, asymmetric face, "
            "watermark, signature, text, logo, label, username"
        ),
    },
    "juggernaut": {
        "ckpt": "Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors",
        "sampler": "dpmpp_2m_sde",
        "scheduler": "karras",
        "cfg": 4.5,
        "steps": 35,
        "clip_skip": 1,
        "prefix": "",  # natural language only
        "negative": (
            "cartoon, drawing, illustration, anime, manga, painting, sketch, "
            "render, 3d, cgi, plastic skin, washed out, oversaturated, "
            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
            "bad anatomy, deformed, mutated, extra limbs, extra fingers, missing fingers, "
            "fused fingers, malformed hands, "
            "watermark, signature, text, logo, username"
        ),
    },
    "pony": {
        "ckpt": "ponyDiffusionV6XL_v6StartWithThisOne.safetensors",
        "sampler": "euler_ancestral",
        "scheduler": "normal",
        "cfg": 7.5,
        "steps": 25,
        "clip_skip": 2,
        # REQUIRED — the full chain. Just `score_9` alone is much weaker.
        "prefix": "score_9, score_8_up, score_7_up, score_6_up, score_5_up, score_4_up, ",
        # Pony's creator notes negatives are usually unnecessary; conservative
        # baseline only. Source-toggle tags (source_pony/furry/anime/cartoon)
        # are intentionally omitted — they exclude entire content domains.
        "negative": (
            "score_6, score_5, score_4, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
            "bad anatomy, bad proportions, bad hands, extra digit, fewer digits, "
            "fused fingers, malformed limbs, deformed, ugly, "
            "censored, monochrome, "
            "watermark, signature, text, logo, artist name, patreon username, twitter username"
        ),
    },
    "general": {
        "ckpt": "talmendoxlSDXL_v11Beta.safetensors",
        "sampler": "dpmpp_2m",
        "scheduler": "karras",
        "cfg": 8.0,  # Talmendo wants notably higher CFG than the others
        "steps": 30,
        "clip_skip": 2,
        "prefix": "",  # creator says don't push "masterpiece" — fights the amateur aesthetic
        "negative": (
            "lowres, blurry, jpeg artifacts, noisy, grainy, low quality, worst quality, "
            "bad anatomy, deformed, mutated, extra limbs, missing fingers, fused fingers, "
            "malformed hands, ugly, "
            "watermark, signature, text, logo"
        ),
    },
    "furry-nai": {
        "ckpt": "reedFURRYMixSDXL_v23nai.safetensors",
        "sampler": "euler_ancestral",
        "scheduler": "normal",
        "cfg": 5.0,
        "steps": 30,
        "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, high quality, good quality, "
            "detailed eyes, highres, absurdres, incredibly absurdres, "
        ),
        # No trailing ", " on the last fragment: callers join extra negatives
        # with ", " themselves, and a dangling separator produced ", , ".
        "negative": (
            "worst quality, bad_quality, normal quality, lowres, anatomical nonsense, "
            "bad anatomy, anatomical nonsense, interlocked fingers, extra fingers, "
            "watermark, simple background, transparent, bad_feet, bad_hands, "
            "logo, text, bad_anatomy, signature, face backlighting, "
            "(worst quality, bad quality:1.2), jpeg artifacts, censored, "
            "extra digit, ugly, deformed anatomy, bad proportions"
        ),
    },
    "furry-noob": {
        "ckpt": "indigoVoidFurryFusedXL_noobaiV32.safetensors",
        "sampler": "euler_ancestral",  # creator: other samplers won't work
        "scheduler": "normal",
        "cfg": 4.5,
        "steps": 20,
        "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, perfect quality, absurdres, newest, "
            "very aesthetic, vibrant colors, "
        ),
        "negative": (
            "human, realistic, photorealistic, 3d, cgi, "
            "shiny skin, shiny clothing, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
            "bad anatomy, bad hands, mutated hands, bad proportions, "
            "extra digit, fewer digits, fused fingers, malformed limbs, deformed, ugly, "
            "watermark, signature, text, logo, username, artist signature"
        ),
    },
    "furry-il": {
        "ckpt": "novaFurryXL_ilV170.safetensors",
        "sampler": "euler_ancestral",
        "scheduler": "normal",
        "cfg": 4.0,
        "steps": 30,
        "clip_skip": 2,
        # Illustrious wants `newest` in positive and `old`/`oldest` in negative
        # — these are year-bucket tags from the training set. `furry` and
        # `anthro` are universally helpful here.
        "prefix": (
            "masterpiece, best quality, amazing quality, very aesthetic, "
            "high resolution, ultra-detailed, absurdres, newest, furry, anthro, "
        ),
        "negative": (
            "human, multiple tails, modern, recent, old, oldest, "
            "graphic, cartoon, painting, crayon, graphite, abstract, glitch, "
            "deformed, mutated, ugly, disfigured, long body, conjoined, "
            "lowres, bad anatomy, bad hands, missing fingers, extra digits, fewer digits, "
            "cropped, very displeasing, worst quality, bad quality, sketch, "
            "jpeg artifacts, signature, watermark, username, text, simple background, "
            "bad ai-generated"
        ),
    },
}
|
||
|
||
# Style used when no routing rule matches the prompt.
DEFAULT_STYLE = "general"

# Keyword router consulted only when the caller did not choose a style.
# Rules are tried top to bottom and the first hit wins, so narrower patterns
# must stay above broader ones. Every pattern is compiled case-insensitively.
ROUTING_RULES = [
    (re.compile(pattern, re.I), style)
    for pattern, style in (
        # Pony score chain is the single strongest signal — Pony only
        (r"\bscore_\d", "pony"),
        (r"\bpony\b", "pony"),
        # NoobAI / Illustrious explicit mentions
        (r"\b(noobai|noob)\b", "furry-noob"),
        (r"\b(illustrious|ilxl)\b", "furry-il"),
        # Generic furry — defaults to NovaFurry (Illustrious lineage, current sweet spot)
        (r"\b(furry|anthro|feral|kemono|fursona|species)\b", "furry-il"),
        # Photo / photoreal
        (r"\b(juggernaut)\b", "juggernaut"),
        (r"\b(photo|photograph|realistic|portrait|selfie|cinematic)\b", "photo"),
        # Generic anime / illustration → Pony covers anime well
        (r"\b(anime|manga|2d|illustration)\b", "pony"),
    )
]
|
||
|
||
|
||
def _route_style(prompt: str) -> str:
    """Pick a style for `prompt` with the keyword router.

    ROUTING_RULES is scanned in order; the style paired with the first
    pattern found anywhere in the prompt is returned. When nothing matches,
    DEFAULT_STYLE is returned.
    """
    hits = (style for pattern, style in ROUTING_RULES if pattern.search(prompt))
    return next(hits, DEFAULT_STYLE)
|
||
|
||
|
||
def _inherited_style(messages: Optional[list]) -> Optional[str]:
    """
    Return the `style` arg from the most recent generate_image /
    edit_image tool call in the conversation. Used so edit_image can
    auto-inherit the style of the image being edited when the LLM
    didn't pass one explicitly — without this, an edit on a furry
    image with a neutral edit prompt ("make the eyes glow") falls
    through to the keyword router and picks a wrong style.

    Messages are scanned newest first; within one message the tool calls
    are scanned in their listed order. Malformed entries (non-dict
    messages / tool calls / function payloads, unparsable or non-object
    argument JSON, unknown style names) are skipped rather than raising.

    :param messages: Chat messages as dicts, oldest first; may be None.
    :return: A key of STYLES, or None when no usable style is found.
    """
    if not messages:
        return None

    for msg in reversed(messages):
        if not isinstance(msg, dict):
            continue
        tool_calls = msg.get("tool_calls")
        if not isinstance(tool_calls, (list, tuple)):
            continue
        for tc in tool_calls:
            if not isinstance(tc, dict):
                continue
            fn = tc.get("function")
            if not isinstance(fn, dict):
                continue
            if fn.get("name") not in ("generate_image", "edit_image"):
                continue

            args = fn.get("arguments")
            if isinstance(args, str):
                # Arguments usually arrive as a JSON-encoded string.
                try:
                    args = json.loads(args)
                except (TypeError, ValueError):
                    continue
            # json.loads may legally yield a list / None / scalar — only a
            # JSON object can carry a `style` key.
            if not isinstance(args, dict):
                continue

            style = args.get("style")
            if isinstance(style, str) and style in STYLES:
                return style
    return None
|
||
|
||
|
||
def _seed_value(seed: int) -> int:
    """Return `seed` when it is positive, otherwise a clock-derived seed.

    A non-positive seed means "randomize": the fallback is the current time
    in milliseconds reduced modulo 2**31.
    """
    if seed > 0:
        return seed
    now_ms = int(time.time() * 1000)
    return now_ms % (2**31)
|
||
|
||
|
||
def _job_prefix(kind: str) -> str:
    """Build a unique SaveImage filename_prefix of the form `<kind>_<10 hex>`.

    Each submission gets its own prefix so SaveImage outputs from concurrent
    jobs can never share an auto-numbered counter and cross over.
    """
    token = uuid.uuid4().hex[:10]
    return f"{kind}_{token}"
|
||
|
||
|
||
def _build_txt2img(positive: str, negative: str, settings: dict,
                   width: int, height: int, seed: int) -> dict:
    """
    Assemble the SDXL txt2img workflow graph as a node-id → node dict.

    CLIP skip is applied through a CLIPSetLastLayer node (id "10") so the
    same graph handles skip 1 (-1) and skip 2 (-2). The SaveImage node is
    id "9" and gets a fresh per-job filename prefix.
    """
    def node(class_type: str, **inputs) -> dict:
        # One workflow node in the shape used throughout this graph.
        return {"class_type": class_type, "inputs": inputs}

    return {
        "3": node(
            "KSampler",
            seed=_seed_value(seed),
            steps=settings["steps"],
            cfg=settings["cfg"],
            sampler_name=settings["sampler"],
            scheduler=settings["scheduler"],
            denoise=1.0,
            model=["4", 0],
            positive=["6", 0],
            negative=["7", 0],
            latent_image=["5", 0],
        ),
        "4": node("CheckpointLoaderSimple", ckpt_name=settings["ckpt"]),
        "5": node("EmptyLatentImage", width=width, height=height, batch_size=1),
        "6": node("CLIPTextEncode", text=positive, clip=["10", 0]),
        "7": node("CLIPTextEncode", text=negative, clip=["10", 0]),
        "8": node("VAEDecode", samples=["3", 0], vae=["4", 2]),
        "9": node("SaveImage", filename_prefix=_job_prefix("smartgen"), images=["8", 0]),
        "10": node(
            "CLIPSetLastLayer",
            stop_at_clip_layer=-settings["clip_skip"],
            clip=["4", 1],
        ),
    }
|
||
|
||
|
||
def _build_inpaint(positive: str, negative: str, settings: dict,
                   image_filename: str, mask_text: str,
                   denoise: float, seed: int) -> dict:
    """
    Assemble the SDXL inpainting workflow with text-driven masking.

    comfyui_segment_anything (GroundingDINO + SAM-HQ — installed by the
    Dockerfile) derives a mask from `mask_text` (a noun phrase like
    "the dog's collar"); SetLatentNoiseMask + KSampler then repaint only
    that region, leaving everything outside the mask pixel-perfect.

    The raw SAM mask goes through GrowMask with tapered_corners before it
    reaches the sampler. Otherwise the mask edge is pixel-binary and
    KSampler repaints right up to a hard boundary — SDXL has no
    surrounding-pixel context inside the mask to blend with, so the
    inpainted region looks pasted-on with visible seams. expand=12px +
    taper gives a soft transition that blends naturally.

    The first inpaint downloads ~3 GB of SAM/GroundingDINO weights into
    /opt/comfyui/models/{sams,grounding-dino}/ — later runs reuse them.
    """
    def node(class_type: str, **inputs) -> dict:
        # One workflow node in the shape used throughout this graph.
        return {"class_type": class_type, "inputs": inputs}

    return {
        "3": node(
            "KSampler",
            seed=_seed_value(seed),
            steps=settings["steps"],
            cfg=settings["cfg"],
            sampler_name=settings["sampler"],
            scheduler=settings["scheduler"],
            denoise=denoise,
            model=["4", 0],
            positive=["6", 0],
            negative=["7", 0],
            latent_image=["13", 0],
        ),
        "4": node("CheckpointLoaderSimple", ckpt_name=settings["ckpt"]),
        "6": node("CLIPTextEncode", text=positive, clip=["10", 0]),
        "7": node("CLIPTextEncode", text=negative, clip=["10", 0]),
        "8": node("VAEDecode", samples=["3", 0], vae=["4", 2]),
        "9": node("SaveImage", filename_prefix=_job_prefix("smartinpaint"), images=["8", 0]),
        "10": node(
            "CLIPSetLastLayer",
            stop_at_clip_layer=-settings["clip_skip"],
            clip=["4", 1],
        ),
        "11": node("VAEEncode", pixels=["12", 0], vae=["4", 2]),
        "12": node("LoadImage", image=image_filename),
        # Masked latent: source latent from "11", grown mask from "17".
        "13": node("SetLatentNoiseMask", samples=["11", 0], mask=["17", 0]),
        "14": node("SAMModelLoader (segment anything)", model_name="sam_hq_vit_h (2.57GB)"),
        "15": node(
            "GroundingDinoModelLoader (segment anything)",
            model_name="GroundingDINO_SwinT_OGC (694MB)",
        ),
        "16": node(
            "GroundingDinoSAMSegment (segment anything)",
            sam_model=["14", 0],
            grounding_dino_model=["15", 0],
            image=["12", 0],
            prompt=mask_text,
            threshold=0.3,
        ),
        # Output slot 1 of node "16" is fed in as the mask to grow.
        "17": node("GrowMask", mask=["16", 1], expand=12, tapered_corners=True),
    }
|
||
|
||
|
||
def _build_img2img(positive: str, negative: str, settings: dict,
                   image_filename: str, denoise: float, seed: int) -> dict:
    """
    Assemble the SDXL img2img workflow graph.

    `image_filename` (already uploaded to ComfyUI's /input/) is loaded,
    VAE-encoded to a latent and handed to the sampler at the requested
    denoise. Resolution is whatever the source image is — no resize.
    """
    def node(class_type: str, **inputs) -> dict:
        # One workflow node in the shape used throughout this graph.
        return {"class_type": class_type, "inputs": inputs}

    return {
        "3": node(
            "KSampler",
            seed=_seed_value(seed),
            steps=settings["steps"],
            cfg=settings["cfg"],
            sampler_name=settings["sampler"],
            scheduler=settings["scheduler"],
            denoise=denoise,
            model=["4", 0],
            positive=["6", 0],
            negative=["7", 0],
            latent_image=["11", 0],
        ),
        "4": node("CheckpointLoaderSimple", ckpt_name=settings["ckpt"]),
        "6": node("CLIPTextEncode", text=positive, clip=["10", 0]),
        "7": node("CLIPTextEncode", text=negative, clip=["10", 0]),
        "8": node("VAEDecode", samples=["3", 0], vae=["4", 2]),
        "9": node("SaveImage", filename_prefix=_job_prefix("smartedit"), images=["8", 0]),
        "10": node(
            "CLIPSetLastLayer",
            stop_at_clip_layer=-settings["clip_skip"],
            clip=["4", 1],
        ),
        "11": node("VAEEncode", pixels=["12", 0], vae=["4", 2]),
        "12": node("LoadImage", image=image_filename),
    }
|
||
|
||
|
||
def _file_dict_is_image(f: dict) -> bool:
    """Tell whether a file dict looks like an image.

    True when its `type` contains "image" (case-insensitive) or when its
    `name` / `filename` ends in .png, .jpg, .jpeg or .webp.
    """
    declared_type = (f.get("type") or "").lower()
    file_name = (f.get("name") or f.get("filename") or "").lower()
    if "image" in declared_type:
        return True
    return file_name.endswith((".png", ".jpg", ".jpeg", ".webp"))
|
||
|
||
|
||
# Pulls the file id out of a file URL such as /api/v1/files/<id>/content.
_FILE_URL_ID_RE = re.compile(
    r"/(?:api/v1/)?files/"   # optional /api/v1 prefix, then the files route
    r"([0-9a-fA-F-]{8,})"    # captured id: at least 8 hex digits / hyphens
    r"(?:/content)?"         # optional /content suffix
)
|
||
|
||
|
||
async def _read_file_dict(f: dict) -> Optional[bytes]:
    """
    Try to read raw bytes for one file dict. Tries in order:
      1. Local filesystem path keys (covers user uploads with `path`).
      2. Open WebUI's Files.get_file_by_id with f["id"] (covers files
         the user uploaded via the file API).
      3. Same lookup with the id parsed out of f["url"] (covers
         assistant-emitted files where the message attachment is just
         {"type":"image","url":"/api/v1/files/<uuid>/content"} —
         no id field, no path field, but the URL has the id). Skipped
         when it is the same id already tried in step 2.

    Async because Open WebUI 0.9.0 made every model-class accessor a
    coroutine (Users / Chats / Files / etc.). The accessor result is
    awaited only when it is awaitable, so both the coroutine-returning and
    the plain-return flavours of Files.get_file_by_id work.

    :param f: File dict from a message attachment or the __files__ param.
    :return: File contents, or None when no source could be read. Lookup
        and read failures are logged and never raised.
    """
    log = logging.getLogger(__name__)

    def read_bytes(path) -> Optional[bytes]:
        # Best-effort read: an unreadable or malformed path means "try the
        # next source", not "fail the tool call".
        try:
            with open(path, "rb") as fh:
                return fh.read()
        except (OSError, TypeError, ValueError) as exc:
            log.debug("Could not read image file %r: %s", path, exc)
            return None

    # NOTE(review): these path keys are read straight from the file dict;
    # presumably only Open WebUI populates them — confirm a client cannot
    # inject an arbitrary server-side path here.
    for path_key in ("path", "filepath", "file_path"):
        path = f.get(path_key)
        if path:
            data = read_bytes(path)
            if data is not None:
                return data

    candidate_ids = []
    if f.get("id"):
        candidate_ids.append(f["id"])
    url = f.get("url")
    if isinstance(url, str):
        m = _FILE_URL_ID_RE.search(url)
        # Avoid a second, identical database lookup when the URL carries
        # the same id as the `id` field.
        if m and m.group(1) not in candidate_ids:
            candidate_ids.append(m.group(1))

    if not _OPENWEBUI_RUNTIME:
        return None

    for fid in candidate_ids:
        try:
            file_model = Files.get_file_by_id(fid)
            if inspect.isawaitable(file_model):
                file_model = await file_model
        except Exception:
            # Keep the best-effort contract, but leave a trace: a silent
            # failure here is what hid the 0.9.0 async migration breakage.
            log.warning("Files.get_file_by_id(%r) failed", fid, exc_info=True)
            continue
        if file_model is None:
            continue

        path = getattr(file_model, "path", None)
        if not path:
            # Fall back to a path stored under the model's `meta`, which
            # may be a dict or an attribute-style object.
            meta = getattr(file_model, "meta", None) or {}
            if isinstance(meta, dict):
                path = meta.get("path")
            else:
                path = getattr(meta, "path", None)
        if path:
            data = read_bytes(path)
            if data is not None:
                return data

    return None
|
||
|
||
|
||
async def _first_readable_image(file_dicts) -> Optional[bytes]:
    """Return the bytes of the first image dict in `file_dicts` that can be read, else None."""
    if not isinstance(file_dicts, (list, tuple)):
        return None
    for f in file_dicts:
        if not isinstance(f, dict) or not _file_dict_is_image(f):
            continue
        data = await _read_file_dict(f)
        if data is not None:
            return data
    return None


async def _extract_attached_image(
    files: Optional[list],
    messages: Optional[list],
    metadata: Optional[dict],
    session: aiohttp.ClientSession,
) -> Optional[bytes]:
    """
    Find the most recent image in the chat — including images previously
    emitted by this tool itself. Search order (most recent first):

      1. Inline base64 data URIs in `image_url` content blocks of recent
         messages (vision-model uploads, paste-from-clipboard).
      2. Files attached to messages in the conversation, scanned in
         REVERSE so the newest image wins. This covers two cases:
           a. Files the user just attached (current user message).
           b. Files the assistant emitted via prior `generate_image` /
              `edit_image` calls (attached to assistant messages by the
              `files` event in _push_image_to_chat).
      3. The __files__ tool param (some Open WebUI versions pass user
         uploads here instead of on the message).
      4. The chat record loaded from the database via
         Chats.get_chat_by_id(metadata["chat_id"]), whose messages are
         scanned newest first for file attachments.
      5. Best-effort unauthenticated URL fetch on any image file dict
         (likely fails on auth-protected endpoints — last resort).

    :param files: The __files__ tool param; may be None.
    :param messages: Chat messages, oldest first; may be None.
    :param metadata: Tool metadata; only `chat_id` is read.
    :param session: aiohttp session used for the last-resort URL fetch.
    :return: Raw image bytes, or None when no image could be obtained.
    """
    log = logging.getLogger(__name__)
    newest_first = [m for m in reversed(messages or []) if isinstance(m, dict)]

    # 1. Inline data URIs on recent messages.
    for msg in newest_first:
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        for block in content:
            if not isinstance(block, dict) or block.get("type") != "image_url":
                continue
            image_url = block.get("image_url")
            # Normally a dict with a "url" key; tolerate a bare string and
            # skip anything else instead of raising.
            url = image_url.get("url") if isinstance(image_url, dict) else image_url
            if not isinstance(url, str) or not url.startswith("data:image"):
                continue
            try:
                return base64.b64decode(url.split(",", 1)[1])
            except (IndexError, ValueError):
                # No payload after the comma, or invalid base64 — keep looking.
                continue

    # 2. Files on messages, newest first.
    for msg in newest_first:
        data = await _first_readable_image(msg.get("files"))
        if data is not None:
            return data

    # 3. __files__ param (current user upload, sometimes only here).
    data = await _first_readable_image(files)
    if data is not None:
        return data

    # 4. Pull the chat from the database directly. Open WebUI persists
    # `files` on every message via the upsert in socket/main.py — so even
    # if __messages__ doesn't hydrate the assistant-emitted attachments,
    # the chat record does. This is the strongest fallback.
    if _OPENWEBUI_RUNTIME and metadata:
        chat_id = metadata.get("chat_id")
        if chat_id:
            try:
                chat = Chats.get_chat_by_id(chat_id)
                # Accessor is a coroutine on Open WebUI >= 0.9.0 and a
                # plain call before that; support both.
                if inspect.isawaitable(chat):
                    chat = await chat
                chat_data = getattr(chat, "chat", None) if chat else None
                chat_messages = chat_data.get("messages") if isinstance(chat_data, dict) else None
                for msg in reversed(chat_messages or []):
                    if not isinstance(msg, dict):
                        continue
                    data = await _first_readable_image(msg.get("files"))
                    if data is not None:
                        return data
            except Exception:
                # Best-effort fallback, but never fail silently again.
                log.warning("Chat lookup for image extraction failed (chat_id=%r)",
                            chat_id, exc_info=True)

    # 5. Last-resort URL fetch (no auth — only works for public endpoints).
    url_sources = [files or []] + [msg.get("files") or [] for msg in newest_first]
    for source in url_sources:
        if not isinstance(source, (list, tuple)):
            continue
        for f in source:
            if not isinstance(f, dict) or not _file_dict_is_image(f):
                continue
            url = f.get("url")
            if not url or not isinstance(url, str):
                continue
            # Relative URLs are resolved against the local Open WebUI port.
            full = url if url.startswith("http") else f"http://localhost:8080{url}"
            try:
                async with session.get(full) as resp:
                    if resp.status == 200:
                        return await resp.read()
            except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
                log.debug("Image URL fetch failed for %s: %s", full, exc)

    return None
|
||
|
||
|
||
async def _upload_to_comfyui(
    session: aiohttp.ClientSession, base: str, raw: bytes
) -> Optional[str]:
    """Upload `raw` to ComfyUI's /upload/image endpoint.

    The bytes are sent as a multipart `image` field under a generated
    `smartedit_<12 hex>.png` name with overwrite enabled. Returns the name
    reported in the JSON response (the generated name if the response has
    none), or None on any non-200 status.
    """
    upload_name = "smartedit_" + uuid.uuid4().hex[:12] + ".png"

    payload = aiohttp.FormData()
    payload.add_field("image", raw, filename=upload_name, content_type="image/png")
    payload.add_field("overwrite", "true")

    endpoint = f"{base}/upload/image"
    async with session.post(endpoint, data=payload) as resp:
        if resp.status == 200:
            body = await resp.json()
            return body.get("name", upload_name)
        return None
|
||
|
||
|
||
async def _push_image_to_chat(
    raw: bytes,
    filename_prefix: str,
    request,
    user_dict: Optional[dict],
    metadata: Optional[dict],
    event_emitter: Optional[Callable[[dict], Awaitable[None]]],
) -> bool:
    """
    Surface a generated image in the chat using Open WebUI's canonical
    pattern: upload the bytes via the internal file store, then emit a
    `files` event referencing the served URL. This is the same path Open
    WebUI's own image-generation code uses (utils/middleware.py ~1325).

    :param raw: PNG bytes of the image.
    :param filename_prefix: Prefix for the stored file name; a short random
        suffix and ".png" are appended.
    :param request: The incoming request object (provides `app.url_path_for`).
    :param user_dict: The __user__ dict; only `id` is read.
    :param metadata: Tool metadata; `chat_id` / `message_id` are forwarded
        to the upload handler.
    :param event_emitter: Async callable used to emit the `files` event.
    :return: True if the image was uploaded and emitted via the files
        event. False if anything is missing or any step fails — the caller
        should fall back to a data-URI markdown message in that case.
        Failures are logged, never raised.
    """
    if not (_OPENWEBUI_RUNTIME and request and user_dict and event_emitter):
        return False

    try:
        # Model accessors are coroutines on Open WebUI >= 0.9.0 and plain
        # calls before that — await only when there is something to await.
        user = Users.get_user_by_id(user_dict.get("id"))
        if inspect.isawaitable(user):
            user = await user
        if not user:
            return False

        upload = UploadFile(
            file=io.BytesIO(raw),
            filename=f"{filename_prefix}_{uuid.uuid4().hex[:8]}.png",
            headers={"content-type": "image/png"},
        )
        meta = metadata or {}
        file_item = upload_file_handler(
            request=request,
            file=upload,
            metadata={
                "chat_id": meta.get("chat_id"),
                "message_id": meta.get("message_id"),
            },
            process=False,
            user=user,
        )
        # upload_file_handler may be sync or async depending on the Open
        # WebUI version — handle either.
        if inspect.isawaitable(file_item):
            file_item = await file_item

        url = request.app.url_path_for(
            "get_file_content_by_id", id=file_item.id
        )

        await event_emitter({
            "type": "files",
            "data": {"files": [{"type": "image", "url": url}]},
        })
        return True
    except Exception:
        # Any failure (signature drift, missing route, etc.) falls back
        # to the data-URI path in the caller. Log it with the traceback:
        # swallowing it silently is what made the async-accessor migration
        # look like a rendering bug.
        logging.getLogger(__name__).warning(
            "Uploading generated image to the Open WebUI file store failed; "
            "caller will fall back to a data-URI message",
            exc_info=True,
        )
        return False
|
||
|
||
|
||
async def _submit_and_fetch(
    session: aiohttp.ClientSession,
    base: str,
    workflow: dict,
    timeout_seconds: int,
    emit: Callable[[str, bool], Awaitable[None]],
    settings: dict,
) -> tuple[Optional[bytes], Optional[str]]:
    """Submit a workflow, poll history, fetch the first output image.

    :param session: aiohttp session used for all ComfyUI requests.
    :param base: ComfyUI base URL without a trailing slash.
    :param workflow: Node graph to POST to /prompt.
    :param timeout_seconds: Maximum time to poll /history for a result.
    :param emit: Status callback; called once after the prompt is accepted.
    :param settings: Style settings; sampler/scheduler/cfg/steps are only
        used for the status message.
    :return: (image_bytes, None) on success or (None, error_message) on
        failure. Besides rejection and timeout, an execution error or a
        completed-but-empty run reported in the history entry's `status`
        is returned immediately instead of polling until the timeout.
    """
    client_id = str(uuid.uuid4())

    async with session.post(
        f"{base}/prompt", json={"prompt": workflow, "client_id": client_id}
    ) as resp:
        if resp.status != 200:
            return None, f"ComfyUI rejected the prompt: {resp.status} {await resp.text()}"
        prompt_id = (await resp.json()).get("prompt_id")
        if not prompt_id:
            return None, "ComfyUI didn't return a prompt_id."

    await emit(
        f"Sampling — {settings['sampler']}/{settings['scheduler']}, "
        f"CFG {settings['cfg']}, {settings['steps']} steps", False
    )

    # The SaveImage node in every workflow this tool builds is id "9".
    # We prefer it explicitly because intermediate nodes (e.g. the
    # GroundingDinoSAMSegment IMAGE output in the inpaint workflow) can
    # land in the outputs dict too, and dict iteration order is not
    # stable across runs — without preferring "9" we sometimes returned
    # an overlay or masked-only image that rendered mostly black.
    SAVE_NODE_ID = "9"

    # Monotonic clock: the deadline must not move if the wall clock is
    # adjusted while a long generation is running.
    deadline = time.monotonic() + timeout_seconds
    output_images: list = []
    while time.monotonic() < deadline:
        await asyncio.sleep(1.5)
        async with session.get(f"{base}/history/{prompt_id}") as resp:
            if resp.status != 200:
                continue
            history = await resp.json()

        entry = history.get(prompt_id) if isinstance(history, dict) else None
        if not isinstance(entry, dict):
            continue  # not finished yet

        status = entry.get("status")
        if not isinstance(status, dict):
            status = {}

        if status.get("status_str") == "error":
            # Presumably each message is an [event_name, payload] pair and
            # the failure payload carries `exception_message` — treated as
            # optional detail, so an unexpected shape only loses the detail.
            detail = ""
            for message in status.get("messages") or []:
                if (
                    isinstance(message, (list, tuple))
                    and len(message) == 2
                    and message[0] == "execution_error"
                    and isinstance(message[1], dict)
                ):
                    detail = str(message[1].get("exception_message") or "").strip()
            suffix = f": {detail}" if detail else "."
            return None, f"ComfyUI reported an execution error{suffix}"

        outputs = entry.get("outputs") or {}
        # Prefer the canonical SaveImage output …
        save_out = outputs.get(SAVE_NODE_ID)
        if isinstance(save_out, dict):
            output_images.extend(save_out.get("images") or [])
        # … only fall back to other nodes if SaveImage didn't fire
        # (workflow drift, manual override, etc.)
        if not output_images:
            for node_out in outputs.values():
                if isinstance(node_out, dict):
                    output_images.extend(node_out.get("images") or [])
        if output_images:
            break

        if status.get("completed"):
            # The run is over and nothing was saved — more polling cannot
            # change that.
            return None, "ComfyUI finished the workflow but produced no image."

    if not output_images:
        return None, f"Timed out after {timeout_seconds}s waiting for image."

    img = output_images[0]
    params = {
        "filename": img["filename"],
        "subfolder": img.get("subfolder", ""),
        "type": img.get("type", "output"),
    }
    async with session.get(f"{base}/view", params=params) as resp:
        if resp.status != 200:
            return None, f"Failed to fetch image: {resp.status}"
        return await resp.read(), None
|
||
|
||
|
||
class Tools:
|
||
class Valves(BaseModel):
|
||
COMFYUI_BASE_URL: str = Field(
|
||
default="http://comfyui:8188",
|
||
description="ComfyUI server URL reachable from the open-webui container.",
|
||
)
|
||
TIMEOUT_SECONDS: int = Field(
|
||
default=600,
|
||
description=(
|
||
"Maximum wait for a single generation to complete. "
|
||
"Default 10 minutes — long enough to absorb a first-time "
|
||
"inpaint where SAM-HQ + GroundingDINO + BERT auto-download "
|
||
"(~3 GB). Steady-state runs finish in well under a minute; "
|
||
"if your KSampler routinely takes longer than that, lower "
|
||
"the per-style steps in STYLES."
|
||
),
|
||
)
|
||
|
||
def __init__(self):
|
||
self.valves = self.Valves()
|
||
|
||
async def generate_image(
    self,
    prompt: str,
    style: Optional[StyleName] = None,
    negative_prompt: Optional[str] = None,
    width: int = 1024,
    height: int = 1024,
    seed: int = 0,
    __request__=None,
    __user__: Optional[dict] = None,
    __metadata__: Optional[dict] = None,
    __event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None,
) -> str:
    """
    Create a NEW image from scratch and show it to the user. Use this
    whenever the user asks you to draw, generate, create, make, paint,
    render, or imagine any visual content — photographs, portraits,
    characters, scenes, illustrations, anime, drawings — and they have
    NOT attached an existing image. If they did attach an image and
    want it modified, use edit_image instead.

    Pick `style` to match what the user wants:
    - "photo" — photorealistic photographs, portraits, cinematic shots.
    - "juggernaut" — alternate photoreal style (sharper, more saturated).
    - "pony" — anime / illustration / cartoon (Pony Diffusion).
    - "general" — fallback for anything that doesn't fit the others.
    - "furry-nai" — anthropomorphic characters (NAI-trained mix).
    - "furry-noob" — anthropomorphic characters (NoobAI base).
    - "furry-il" — anthropomorphic characters (Illustrious base, default
      for any "furry" / "anthro" request unless specified otherwise).

    Each style auto-prepends the right quality tags and picks the right
    sampler / CFG / steps / CLIP skip. Do NOT add tags like
    "masterpiece" or "score_9" to `prompt` yourself; the tool handles
    that.

    :param prompt: Plain description of the image (subject, scene,
        style notes, lighting, etc.). No quality tags.
    :param style: One of the values above. Omit to auto-detect.
    :param negative_prompt: Extra terms to exclude. Usually unneeded.
    :param width: Pixels (default 1024 — SDXL native). For portraits
        use 832 with height 1216; for landscapes 1216 with height 832.
    :param height: Pixels (default 1024).
    :param seed: 0 to randomize, otherwise a specific seed for repeats.
    :return: Status text for the LLM; the image itself is delivered via chat events.
    """
    # An explicit style wins; otherwise fall back to keyword routing on
    # the prompt text.
    chosen = style or _route_style(prompt)
    settings = STYLES.get(chosen)
    if not settings:
        return f"Unknown style '{chosen}'. Available: {', '.join(STYLES.keys())}"

    async def emit(msg: str, done: bool = False):
        # Status-track helper; a silent no-op when no emitter was supplied.
        if __event_emitter__:
            await __event_emitter__({
                "type": "status",
                "data": {"description": msg, "done": done},
            })

    await emit(f"Routing to {chosen} ({settings['ckpt']})")

    # Style prefix goes in front of the user's prompt; user-supplied
    # negatives are appended to the style's own negatives.
    positive = f"{settings['prefix']}{prompt}"
    negative = settings["negative"]
    if negative_prompt:
        negative = f"{negative}, {negative_prompt}"

    workflow = _build_txt2img(positive, negative, settings, width, height, seed)
    base = self.valves.COMFYUI_BASE_URL.rstrip("/")

    async with aiohttp.ClientSession() as session:
        raw, err = await _submit_and_fetch(
            session, base, workflow, self.valves.TIMEOUT_SECONDS, emit, settings,
        )
    if err:
        return err

    # Surface the image in the chat. Preferred path uploads to Open
    # WebUI's file store and emits a `files` event (matches the built-
    # in image-gen flow). Fallback inlines a data-URI markdown via a
    # `message` event for environments where the file API isn't
    # reachable from the tool process.
    pushed = await _push_image_to_chat(
        raw, "smartgen", __request__, __user__, __metadata__, __event_emitter__,
    )
    if not pushed:
        if not __event_emitter__:
            # Neither delivery path is available, so do not tell the LLM
            # the user can see an image that was never shown.
            return (
                f"Image was generated (style: {chosen}, checkpoint: "
                f"{settings['ckpt']}) but could not be displayed: the upload "
                f"to the file store failed and no event emitter is available. "
                f"Tell the user the image could not be delivered."
            )
        b64 = base64.b64encode(raw).decode("ascii")
        # NOTE(review): MIME type assumes the ComfyUI workflow saves PNG —
        # confirm against the SaveImage node used by the workflow builders.
        await __event_emitter__({
            "type": "message",
            "data": {"content": f"![generated image](data:image/png;base64,{b64})"},
        })

    await emit(f"Done — {chosen}", done=True)
    return (
        f"Image generated and shown to the user above (style: {chosen}, "
        f"checkpoint: {settings['ckpt']}). Do NOT describe the image, "
        f"do NOT repeat any base64 or markdown — the user can see it. "
        f"You may briefly note your style choice and offer one or two "
        f"iteration ideas (different style, tighter framing, etc)."
    )
|
||
|
||
async def edit_image(
    self,
    prompt: str,
    style: Optional[StyleName] = None,
    mask_text: Optional[str] = None,
    denoise: Optional[float] = None,
    negative_prompt: Optional[str] = None,
    seed: int = 0,
    __request__=None,
    __user__: Optional[dict] = None,
    __metadata__: Optional[dict] = None,
    __files__: Optional[list] = None,
    __messages__: Optional[list] = None,
    __event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None,
) -> str:
    """
    Edit, modify, transform, or restyle an image the user has ATTACHED
    to the chat. Use whenever the user uploads an image and asks to
    change it. If no image is attached, use generate_image instead.

    TWO MODES — choose based on whether the change is local or global:

    - LOCAL change ("change the ball to a basketball", "make the dog
      wear a hat", "remove the bird") → set `mask_text` to a brief
      noun phrase describing the region ("the ball", "the dog", "the
      bird"). The tool uses GroundingDINO+SAM to find that region
      automatically and only that area is repainted; the rest of the
      image stays pixel-perfect.

    - GLOBAL change ("make this a sunset", "turn this into anime",
      "restyle this as oil painting") → leave `mask_text` unset. The
      whole image is reimagined via img2img.

    Always prefer LOCAL mode when the user names a specific object,
    person, or region. GLOBAL mode is for whole-image style/lighting
    transformations.

    Denoise tuning:
    - LOCAL (mask_text set): default 1.0 — full repaint within mask.
      Drop to 0.6–0.8 for subtle local edits that should retain some
      original structure.
    - GLOBAL (no mask_text): default 0.7 — moderate edit. Use 0.3–0.5
      for subtle restyling, 0.85–1.0 for radical reimagining.

    Pick `style` for the DESIRED OUTPUT, not the input image.

    Style resolution order: inherited from the most recent prior
    generate_image / edit_image call in this conversation (DOMINANT)
    → explicit `style` arg → keyword detection on `prompt`.
    Inheritance dominates because vision LLMs misclassify subjects
    in the rendered output (e.g. picking 'juggernaut' on a
    'furry-il' source). For follow-up edits on an image you
    generated earlier, omit `style` entirely — the tool reuses the
    established style automatically. The user can start a new chat
    if they want a different style.

    :param prompt: What the changed area should look like.
        Tool auto-prepends quality tags — don't include those.
    :param style: One of the StyleName values. Omit to auto-inherit
        from the previous tool call (recommended for edits on
        images you generated earlier in this chat).
    :param mask_text: Noun phrase describing the region to edit. Set
        for LOCAL changes; omit for GLOBAL.
    :param denoise: 0.0 = no change, 1.0 = ignore source. Defaults to
        1.0 with mask_text, 0.7 without.
    :param negative_prompt: Extra terms to exclude. Usually unneeded.
    :param seed: 0 to randomize, otherwise specific.
    :return: Status text for the LLM (the image is delivered via chat events), or an error message.
    """
    # Resolve style — inheritance DOMINATES for edits. Vision LLMs
    # misclassify subject types (observed in the wild: juggernaut
    # picked for a furry-il source because the model thought the
    # rendered character looked "photoreal-ish"). When there's a
    # prior tool call in this chat, use the same style; the user's
    # workaround for genuine style changes is a fresh chat.
    chosen = _inherited_style(__messages__) or style or _route_style(prompt)
    settings = STYLES.get(chosen)
    if not settings:
        return f"Unknown style '{chosen}'. Available: {', '.join(STYLES.keys())}"

    # Denoise default depends on mode: 1.0 (full repaint within mask)
    # for inpainting, 0.7 for img2img. Explicit values are clamped to
    # the [0.0, 1.0] range.
    if denoise is None:
        denoise = 1.0 if mask_text else 0.7
    denoise = max(0.0, min(1.0, denoise))

    async def emit(msg: str, done: bool = False):
        # Status-track helper; a silent no-op when no emitter was supplied.
        if __event_emitter__:
            await __event_emitter__({
                "type": "status",
                "data": {"description": msg, "done": done},
            })

    base = self.valves.COMFYUI_BASE_URL.rstrip("/")

    async with aiohttp.ClientSession() as session:
        await emit("Looking for attached image…")
        raw_in = await _extract_attached_image(
            __files__, __messages__, __metadata__, session,
        )
        if raw_in is None:
            msgs_with_files = sum(
                1 for m in (__messages__ or [])
                if isinstance(m, dict) and m.get("files")
            )
            chat_id_present = bool((__metadata__ or {}).get("chat_id"))
            # Close the status opened above so it is not left in progress.
            await emit("No attached image found", done=True)
            return (
                "No image found in the chat. Diagnostics: "
                f"__files__={len(__files__ or [])}, "
                f"__messages__={len(__messages__ or [])} "
                f"(of which {msgs_with_files} had a files field), "
                f"chat_id_present={chat_id_present}, "
                f"openwebui_runtime={_OPENWEBUI_RUNTIME}. "
                "Ask the user to attach the image they want edited "
                "(paperclip / drag-drop), or call generate_image instead."
            )

        # Diagnostic emit so a misrouted source ("wrong image
        # returned") shows up in the status track instead of being
        # invisible. SHA-1 is fast and the first 8 hex chars are
        # plenty to compare against the prior generation's hash if
        # cross-talk is suspected.
        import hashlib  # local import — keeps the module import surface clean
        src_hash = hashlib.sha1(raw_in).hexdigest()[:8]
        await emit(f"Uploading source to ComfyUI… (sha1={src_hash}, {len(raw_in)} bytes)")
        uploaded_name = await _upload_to_comfyui(session, base, raw_in)
        if not uploaded_name:
            # Close the status track on this failure path as well.
            await emit("Upload to ComfyUI failed", done=True)
            return "Failed to upload source image to ComfyUI."

        mode = "inpaint" if mask_text else "img2img"
        await emit(
            f"Routing to {chosen} ({settings['ckpt']}), {mode}, denoise {denoise:.2f}"
            + (f", mask='{mask_text}'" if mask_text else "")
        )

        # Style prefix goes in front of the user's prompt; user-supplied
        # negatives are appended to the style's own negatives.
        positive = f"{settings['prefix']}{prompt}"
        negative = settings["negative"]
        if negative_prompt:
            negative = f"{negative}, {negative_prompt}"

        if mask_text:
            # LOCAL mode: masked inpaint of the region named by mask_text.
            workflow = _build_inpaint(
                positive=positive,
                negative=negative,
                settings=settings,
                image_filename=uploaded_name,
                mask_text=mask_text,
                denoise=denoise,
                seed=seed,
            )
        else:
            # GLOBAL mode: whole-image img2img.
            workflow = _build_img2img(
                positive=positive,
                negative=negative,
                settings=settings,
                image_filename=uploaded_name,
                denoise=denoise,
                seed=seed,
            )

        raw_out, err = await _submit_and_fetch(
            session, base, workflow, self.valves.TIMEOUT_SECONDS, emit, settings,
        )
    if err:
        return err

    # Preferred delivery is the file-store upload; the fallback inlines
    # the result as a data-URI markdown image via a `message` event.
    pushed = await _push_image_to_chat(
        raw_out, "smartedit", __request__, __user__, __metadata__, __event_emitter__,
    )
    if not pushed:
        if not __event_emitter__:
            # Neither delivery path is available, so do not tell the LLM
            # the user can see an image that was never shown.
            return (
                f"Image was edited (style: {chosen}, checkpoint: "
                f"{settings['ckpt']}, denoise: {denoise:.2f}) but could not be "
                f"displayed: the upload to the file store failed and no event "
                f"emitter is available. Tell the user the image could not be "
                f"delivered."
            )
        b64 = base64.b64encode(raw_out).decode("ascii")
        # NOTE(review): MIME type assumes the ComfyUI workflow saves PNG —
        # confirm against the SaveImage node used by the workflow builders.
        await __event_emitter__({
            "type": "message",
            "data": {"content": f"![edited image](data:image/png;base64,{b64})"},
        })

    await emit(f"Done — {chosen} (denoise {denoise:.2f})", done=True)
    return (
        f"Edited image shown to the user above (style: {chosen}, "
        f"checkpoint: {settings['ckpt']}, denoise: {denoise:.2f}). Do NOT "
        f"describe the image, do NOT repeat any base64 or markdown — the "
        f"user can see it. You may briefly note your choice and offer "
        f"iterations (different denoise, alternate style, etc)."
    )
|