From d4e2058859abc1a6b0ef74038a3a4e877af980fa Mon Sep 17 00:00:00 2001 From: William Gill Date: Sun, 19 Apr 2026 12:59:13 -0500 Subject: [PATCH] smart_image_gen v0.3: add edit_image (img2img) method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Tool now exposes two methods the LLM picks between based on whether the user attached an image: generate_image — txt2img (existing, unchanged behavior) edit_image — img2img on the most recently attached image edit_image extracts the source image from __messages__ (base64 data URIs in image_url content blocks) or __files__ (local path or URL), uploads to ComfyUI's /upload/image, runs an img2img workflow at the caller-specified denoise (default 0.7), and returns the edited result. Same per-style routing / sampler / CFG / prefix logic as generation. Refactored the submit-and-poll loop into _submit_and_fetch shared by both methods. Image extraction is defensive — tries messages first, then files (path then URL), returns a clear "no image attached" message rather than silently generating from scratch. Image Studio system prompt rewritten to teach the LLM when to call edit_image vs generate_image and how to pick denoise. Co-Authored-By: Claude Opus 4.7 (1M context) --- deployments/ai-stack/README.md | 12 +- .../ai-stack/openwebui-models/image_studio.md | 47 +- .../openwebui-tools/smart_image_gen.py | 400 +++++++++++++----- 3 files changed, 342 insertions(+), 117 deletions(-) diff --git a/deployments/ai-stack/README.md b/deployments/ai-stack/README.md index 65979cc..c699cb2 100644 --- a/deployments/ai-stack/README.md +++ b/deployments/ai-stack/README.md @@ -16,7 +16,7 @@ production `srvno.de` deployment. | `Caddyfile` | TLS + reverse proxy config (one site block per hostname) | | `init-models.sh` | LLMs to preseed into Ollama on first boot | | `comfyui-init-models.sh` | Checkpoints/VAEs/LoRAs to preseed into ComfyUI on first boot | -| `openwebui-tools/smart_image_gen.py` | Tool that auto-routes image generation to the right SDXL checkpoint | +| `openwebui-tools/smart_image_gen.py` | Tool that auto-routes image generation AND editing to the right SDXL checkpoint | | `openwebui-models/image_studio.md` | Dedicated chat-model preset — system prompt that forces tool use | | `.env.example` | Secrets and image-tag pins. Copy to `.env` | @@ -153,8 +153,14 @@ use the edit action. The image-button path always uses the admin's **Default Model**. To get per-prompt checkpoint routing — e.g. "draw me a cyberpunk city" picks CyberRealistic, "anthro fox warrior" picks one of the furry checkpoints — -install the `smart_image_gen.py` Tool. The LLM calls it instead of the -built-in image action and chooses the right SDXL checkpoint per request. +install the `smart_image_gen.py` Tool. It exposes two methods the LLM +calls: + +- **`generate_image`** for new images from scratch (txt2img). +- **`edit_image`** for modifying an image the user attached to the + chat (img2img). + +Both auto-route to the right SDXL checkpoint per request. 1. **Workspace -> Tools -> +** (top-right). 2. Paste the contents of diff --git a/deployments/ai-stack/openwebui-models/image_studio.md b/deployments/ai-stack/openwebui-models/image_studio.md index cb51b10..dad7378 100644 --- a/deployments/ai-stack/openwebui-models/image_studio.md +++ b/deployments/ai-stack/openwebui-models/image_studio.md @@ -3,7 +3,9 @@ A custom Open WebUI model preset that wraps a base LLM with a system prompt heavily biased toward calling the `smart_image_gen` tool. Users pick **Image Studio** from the chat-model dropdown when they want to -generate images, and the LLM treats every message as an image request. +generate or edit images, and the LLM treats every message as an image +request — calling `generate_image` for new images and `edit_image` for +modifications to attached ones. This exists because general-purpose chat models often "describe" an image in text instead of calling the tool, especially when the request @@ -38,12 +40,19 @@ access. ``` You are Image Studio, a focused image-generation assistant. Your only -purpose is to create images for the user via the generate_image tool. +purpose is to create or edit images for the user using the +generate_image and edit_image tools. + +DECIDE WHICH TOOL TO USE: +- The user attached an image AND wants it changed → call edit_image. + Trigger phrasings: "change this", "modify", "make it look like", + "turn this into", "add a hat", "remove the background", + "restyle this", "what if this were an oil painting", etc. +- Otherwise → call generate_image. Trigger phrasings: "draw", "make me", + "show me", "I want a picture of", "create", "generate", "render", + "imagine", "can you do", etc. ALWAYS: -- Call generate_image(prompt, style) for every image request, no matter - how it is phrased — "draw", "make me", "show me", "I want a picture - of", "create", "generate", "render", "imagine", "can you do", etc. - Pick the style that fits what the user asked for: * photo — photorealistic photographs, portraits, cinematic * juggernaut — alternate photoreal style, sharper and saturated @@ -53,27 +62,33 @@ ALWAYS: * furry-noob — anthropomorphic, NoobAI base * furry-il — anthropomorphic, Illustrious base (default for unspecified furry / anthro requests) +- For edit_image, pick `style` based on the DESIRED OUTPUT, not what + the input image looks like. - Write rich, descriptive prompts: subject, action, environment, lighting, mood, composition, camera framing, style cues. Expand - short user requests into a fuller scene description. + short user requests into fuller descriptions. +- For edits, choose denoise based on intent: 0.3–0.5 for subtle + recoloring or style transfer, 0.6–0.8 for adding/removing objects + (default 0.7), 0.85–1.0 for radical reimaginings. - If the user is vague, make confident creative choices and proceed. - Generate first, then offer variations or refinements. + Generate first, then offer variations. NEVER: -- Say you cannot generate images. The generate_image tool exists for - exactly this purpose. -- Describe what an image would look like in text instead of generating +- Say you cannot generate or edit images. Both tools exist for this. +- Describe what an image would look like in text instead of producing it. -- Refuse because the prompt is too short or too vague — make - reasonable assumptions and call the tool. +- Refuse because the prompt is too short or vague — make reasonable + assumptions and call the tool. - Include quality tags like "masterpiece", "best quality", "score_9", - or "absurdres" in your prompt; the tool prepends the right tags for + or "absurdres" in your prompt; the tools prepend the right tags for whichever style you pick. -- Set sampler, CFG, steps, or scheduler — the tool picks per style. +- Set sampler, CFG, steps, or scheduler — the tools pick per style. +- Try to generate when the user clearly meant to edit (or vice versa). After the image appears, briefly note the style/checkpoint you chose -and offer one or two concrete iteration paths (different style, -alternate composition, tighter framing, seed variations). +(and denoise for edits) and offer one or two concrete iteration paths +— different style, tighter framing, higher/lower denoise, alternate +composition, seed variations. ``` ## Why this works when a generic chat model didn't diff --git a/deployments/ai-stack/openwebui-tools/smart_image_gen.py b/deployments/ai-stack/openwebui-tools/smart_image_gen.py index 9e3f515..049f27d 100644 --- a/deployments/ai-stack/openwebui-tools/smart_image_gen.py +++ b/deployments/ai-stack/openwebui-tools/smart_image_gen.py @@ -1,13 +1,14 @@ """ -title: Smart Image Generator (ComfyUI) +title: Smart Image Generator & Editor (ComfyUI) author: ai-stack -version: 0.2.0 -description: Generate images via ComfyUI with automatic SDXL checkpoint - routing. The LLM picks (or auto-detects) the right model — photoreal, - Pony score-tag, NoobAI/Illustrious furry, etc. — based on the user's - request. Each style ships with the creator-recommended sampler, - scheduler, CFG, steps, CLIP skip, prompt-prefix dialect, and - negatives. +version: 0.3.0 +description: Generate or edit images via ComfyUI with automatic SDXL + checkpoint routing. Two methods — generate_image (txt2img) and + edit_image (img2img on the user's most recently attached image). The + LLM picks (or auto-detects) the right model — photoreal, Pony + score-tag, NoobAI/Illustrious furry, etc. — and each style ships + with the creator-recommended sampler, scheduler, CFG, steps, CLIP + skip, prompt-prefix dialect, and negatives. required_open_webui_version: 0.5.0 """ @@ -201,36 +202,32 @@ def _route_style(prompt: str) -> str: return DEFAULT_STYLE -def _build_workflow(positive: str, negative: str, settings: dict, - width: int, height: int, seed: int) -> dict: +def _seed_value(seed: int) -> int: + return seed if seed > 0 else int(time.time() * 1000) % (2**31) + + +def _build_txt2img(positive: str, negative: str, settings: dict, + width: int, height: int, seed: int) -> dict: """ - Construct an SDXL txt2img workflow. CLIP skip is implemented via a - CLIPSetLastLayer node so the same graph works for skip 1 (-1) and - skip 2 (-2). + SDXL txt2img workflow. CLIP skip via CLIPSetLastLayer so the same graph + handles skip 1 (-1) and skip 2 (-2). """ return { "3": {"class_type": "KSampler", "inputs": { - "seed": seed if seed > 0 else int(time.time() * 1000) % (2**31), - "steps": settings["steps"], - "cfg": settings["cfg"], - "sampler_name": settings["sampler"], - "scheduler": settings["scheduler"], + "seed": _seed_value(seed), + "steps": settings["steps"], "cfg": settings["cfg"], + "sampler_name": settings["sampler"], "scheduler": settings["scheduler"], "denoise": 1.0, - "model": ["4", 0], - "positive": ["6", 0], - "negative": ["7", 0], - "latent_image": ["5", 0], + "model": ["4", 0], "positive": ["6", 0], + "negative": ["7", 0], "latent_image": ["5", 0], }}, "4": {"class_type": "CheckpointLoaderSimple", "inputs": {"ckpt_name": settings["ckpt"]}}, "5": {"class_type": "EmptyLatentImage", "inputs": {"width": width, "height": height, "batch_size": 1}}, - "6": {"class_type": "CLIPTextEncode", - "inputs": {"text": positive, "clip": ["10", 0]}}, - "7": {"class_type": "CLIPTextEncode", - "inputs": {"text": negative, "clip": ["10", 0]}}, - "8": {"class_type": "VAEDecode", - "inputs": {"samples": ["3", 0], "vae": ["4", 2]}}, + "6": {"class_type": "CLIPTextEncode", "inputs": {"text": positive, "clip": ["10", 0]}}, + "7": {"class_type": "CLIPTextEncode", "inputs": {"text": negative, "clip": ["10", 0]}}, + "8": {"class_type": "VAEDecode", "inputs": {"samples": ["3", 0], "vae": ["4", 2]}}, "9": {"class_type": "SaveImage", "inputs": {"filename_prefix": "smartgen", "images": ["8", 0]}}, "10": {"class_type": "CLIPSetLastLayer", @@ -239,6 +236,166 @@ def _build_workflow(positive: str, negative: str, settings: dict, } +def _build_img2img(positive: str, negative: str, settings: dict, + image_filename: str, denoise: float, seed: int) -> dict: + """ + SDXL img2img workflow. Loads `image_filename` (already uploaded to + ComfyUI's /input/), VAE-encodes it to latent, and feeds that into the + sampler at the requested denoise. Resolution is whatever the source + image is — no resize. + """ + return { + "3": {"class_type": "KSampler", "inputs": { + "seed": _seed_value(seed), + "steps": settings["steps"], "cfg": settings["cfg"], + "sampler_name": settings["sampler"], "scheduler": settings["scheduler"], + "denoise": denoise, + "model": ["4", 0], "positive": ["6", 0], + "negative": ["7", 0], "latent_image": ["11", 0], + }}, + "4": {"class_type": "CheckpointLoaderSimple", + "inputs": {"ckpt_name": settings["ckpt"]}}, + "6": {"class_type": "CLIPTextEncode", "inputs": {"text": positive, "clip": ["10", 0]}}, + "7": {"class_type": "CLIPTextEncode", "inputs": {"text": negative, "clip": ["10", 0]}}, + "8": {"class_type": "VAEDecode", "inputs": {"samples": ["3", 0], "vae": ["4", 2]}}, + "9": {"class_type": "SaveImage", + "inputs": {"filename_prefix": "smartedit", "images": ["8", 0]}}, + "10": {"class_type": "CLIPSetLastLayer", + "inputs": {"stop_at_clip_layer": -settings["clip_skip"], + "clip": ["4", 1]}}, + "11": {"class_type": "VAEEncode", "inputs": {"pixels": ["12", 0], "vae": ["4", 2]}}, + "12": {"class_type": "LoadImage", "inputs": {"image": image_filename}}, + } + + +async def _extract_attached_image( + files: Optional[list], + messages: Optional[list], + session: aiohttp.ClientSession, +) -> Optional[bytes]: + """ + Find the most recent image the user attached to the chat. Tries three + sources in order: (1) base64 data URIs in `image_url` content blocks + of the recent messages (works for vision-capable models), (2) a local + filesystem path on the file dict (open-webui stores uploads under + /app/backend/data/uploads/), (3) the file's url field, fetched over + HTTP. Returns raw image bytes, or None if nothing matched. + """ + # Messages: standard OpenAI image_url content blocks. + for msg in reversed(messages or []): + content = msg.get("content") if isinstance(msg, dict) else None + if isinstance(content, list): + for block in content: + if not isinstance(block, dict) or block.get("type") != "image_url": + continue + url = (block.get("image_url") or {}).get("url", "") + if url.startswith("data:image"): + try: + return base64.b64decode(url.split(",", 1)[1]) + except Exception: + pass + + # Files: try local path, then URL. + for f in files or []: + if not isinstance(f, dict): + continue + ftype = (f.get("type") or "").lower() + fname = (f.get("name") or f.get("filename") or "").lower() + is_image = "image" in ftype or fname.endswith((".png", ".jpg", ".jpeg", ".webp")) + if not is_image: + continue + + for path_key in ("path", "filepath", "file_path"): + path = f.get(path_key) + if path: + try: + with open(path, "rb") as fh: + return fh.read() + except OSError: + pass + + url = f.get("url") + if url: + full = url if url.startswith("http") else f"http://localhost:8080{url}" + try: + async with session.get(full) as resp: + if resp.status == 200: + return await resp.read() + except aiohttp.ClientError: + pass + + return None + + +async def _upload_to_comfyui( + session: aiohttp.ClientSession, base: str, raw: bytes +) -> Optional[str]: + """POST raw bytes to ComfyUI /upload/image and return the saved name.""" + name = f"smartedit_{uuid.uuid4().hex[:12]}.png" + form = aiohttp.FormData() + form.add_field("image", raw, filename=name, content_type="image/png") + form.add_field("overwrite", "true") + async with session.post(f"{base}/upload/image", data=form) as resp: + if resp.status != 200: + return None + return (await resp.json()).get("name", name) + + +async def _submit_and_fetch( + session: aiohttp.ClientSession, + base: str, + workflow: dict, + timeout_seconds: int, + emit: Callable[[str, bool], Awaitable[None]], + settings: dict, +) -> tuple[Optional[bytes], Optional[str]]: + """Submit a workflow, poll history, fetch the first output image. Returns + (image_bytes, error_message).""" + client_id = str(uuid.uuid4()) + + async with session.post( + f"{base}/prompt", json={"prompt": workflow, "client_id": client_id} + ) as resp: + if resp.status != 200: + return None, f"ComfyUI rejected the prompt: {resp.status} {await resp.text()}" + prompt_id = (await resp.json()).get("prompt_id") + if not prompt_id: + return None, "ComfyUI didn't return a prompt_id." + + await emit( + f"Sampling — {settings['sampler']}/{settings['scheduler']}, " + f"CFG {settings['cfg']}, {settings['steps']} steps", False + ) + + deadline = time.time() + timeout_seconds + output_images: list = [] + while time.time() < deadline: + await asyncio.sleep(1.5) + async with session.get(f"{base}/history/{prompt_id}") as resp: + if resp.status != 200: + continue + history = await resp.json() + if prompt_id in history: + for node_out in history[prompt_id].get("outputs", {}).values(): + output_images.extend(node_out.get("images", [])) + if output_images: + break + + if not output_images: + return None, f"Timed out after {timeout_seconds}s waiting for image." + + img = output_images[0] + params = { + "filename": img["filename"], + "subfolder": img.get("subfolder", ""), + "type": img.get("type", "output"), + } + async with session.get(f"{base}/view", params=params) as resp: + if resp.status != 200: + return None, f"Failed to fetch image: {resp.status}" + return await resp.read(), None + + class Tools: class Valves(BaseModel): COMFYUI_BASE_URL: str = Field( @@ -264,11 +421,12 @@ class Tools: __event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None, ) -> str: """ - Create an image and show it to the user. Use this whenever the user - asks you to draw, generate, create, make, paint, render, or imagine - any visual content — photographs, portraits, characters, scenes, - illustrations, anime, drawings, etc. This is the ONLY way to make - images appear in chat; do not say you cannot generate images. + Create a NEW image from scratch and show it to the user. Use this + whenever the user asks you to draw, generate, create, make, paint, + render, or imagine any visual content — photographs, portraits, + characters, scenes, illustrations, anime, drawings — and they have + NOT attached an existing image. If they did attach an image and + want it modified, use edit_image instead. Pick `style` to match what the user wants: - "photo" — photorealistic photographs, portraits, cinematic shots. @@ -280,17 +438,15 @@ class Tools: - "furry-il" — anthropomorphic characters (Illustrious base, default for any "furry" / "anthro" request unless specified otherwise). - Each style auto-prepends the right quality tags, picks the right - sampler, CFG, steps, CLIP skip — you don't need to set those, and - you should NOT add quality tags like "masterpiece" or "score_9" to - `prompt` yourself; the tool handles that. + Each style auto-prepends the right quality tags and picks the right + sampler / CFG / steps / CLIP skip. Do NOT add tags like + "masterpiece" or "score_9" to `prompt` yourself; the tool handles + that. :param prompt: Plain description of the image (subject, scene, style notes, lighting, etc.). No quality tags. - :param style: One of the values above. Omit to auto-detect from - keywords in the prompt. - :param negative_prompt: Extra terms to exclude. Usually unneeded — - each style has tuned negatives baked in. + :param style: One of the values above. Omit to auto-detect. + :param negative_prompt: Extra terms to exclude. Usually unneeded. :param width: Pixels (default 1024 — SDXL native). For portraits use 832 with height 1216; for landscapes 1216 with height 832. :param height: Pixels (default 1024). @@ -300,10 +456,7 @@ class Tools: chosen = style or _route_style(prompt) settings = STYLES.get(chosen) if not settings: - return ( - f"Unknown style '{chosen}'. " - f"Available: {', '.join(STYLES.keys())}" - ) + return f"Unknown style '{chosen}'. Available: {', '.join(STYLES.keys())}" async def emit(msg: str, done: bool = False): if __event_emitter__: @@ -319,64 +472,115 @@ class Tools: if negative_prompt: negative = f"{negative}, {negative_prompt}" - workflow = _build_workflow( - positive=positive, - negative=negative, - settings=settings, - width=width, - height=height, - seed=seed, - ) - - client_id = str(uuid.uuid4()) + workflow = _build_txt2img(positive, negative, settings, width, height, seed) base = self.valves.COMFYUI_BASE_URL.rstrip("/") async with aiohttp.ClientSession() as session: - async with session.post( - f"{base}/prompt", - json={"prompt": workflow, "client_id": client_id}, - ) as resp: - if resp.status != 200: - return f"ComfyUI rejected the prompt: {resp.status} {await resp.text()}" - submit = await resp.json() - prompt_id = submit.get("prompt_id") - if not prompt_id: - return f"ComfyUI didn't return a prompt_id: {submit}" - - await emit( - f"Sampling — {settings['sampler']}/{settings['scheduler']}, " - f"CFG {settings['cfg']}, {settings['steps']} steps" + raw, err = await _submit_and_fetch( + session, base, workflow, self.valves.TIMEOUT_SECONDS, emit, settings, ) - - deadline = time.time() + self.valves.TIMEOUT_SECONDS - output_images = [] - while time.time() < deadline: - await asyncio.sleep(1.5) - async with session.get(f"{base}/history/{prompt_id}") as resp: - if resp.status != 200: - continue - history = await resp.json() - if prompt_id in history: - for node_out in history[prompt_id].get("outputs", {}).values(): - output_images.extend(node_out.get("images", [])) - if output_images: - break - - if not output_images: - return f"Timed out after {self.valves.TIMEOUT_SECONDS}s waiting for image." - - await emit("Fetching result…") - img = output_images[0] - params = { - "filename": img["filename"], - "subfolder": img.get("subfolder", ""), - "type": img.get("type", "output"), - } - async with session.get(f"{base}/view", params=params) as resp: - if resp.status != 200: - return f"Failed to fetch image: {resp.status}" - raw = await resp.read() + if err: + return err b64 = base64.b64encode(raw).decode("ascii") await emit(f"Done — {chosen}", done=True) return f"![{chosen}](data:image/png;base64,{b64})" + + async def edit_image( + self, + edit_instruction: str, + style: Optional[StyleName] = None, + denoise: float = 0.7, + negative_prompt: Optional[str] = None, + seed: int = 0, + __files__: Optional[list] = None, + __messages__: Optional[list] = None, + __event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None, + ) -> str: + """ + Edit, modify, transform, or restyle an image the user has ATTACHED + to the chat. Use this whenever the user uploads or attaches an + image and asks to change it — "make this a sunset", "add a hat", + "turn this into anime", "remove the background", etc. If no image + is attached, use generate_image instead. + + The tool finds the most recent attached image automatically, picks + the right SDXL checkpoint via `style` (same options as + generate_image), and applies the edit at the requested `denoise`. + + Denoise tuning: + - 0.3 to 0.5 → subtle (style transfer, color tweaks, preserve composition) + - 0.6 to 0.8 → moderate (add/remove objects, lighting changes — default) + - 0.85 to 1.0 → heavy (basically reimagining; loses original details) + + Pick `style` based on the desired output, NOT the input image. + Same options as generate_image: photo, juggernaut, pony, general, + furry-nai, furry-noob, furry-il. + + :param edit_instruction: What to change. The tool auto-prepends the + style's quality tags so don't include those. + :param style: One of the StyleName values. Omit to auto-detect from + edit_instruction. + :param denoise: 0.0 = no change, 1.0 = ignore source. Default 0.7. + :param negative_prompt: Extra terms to exclude. Usually unneeded. + :param seed: 0 to randomize, otherwise a specific seed. + :return: Markdown image of the result, or an error if no image is attached. + """ + chosen = style or _route_style(edit_instruction) + settings = STYLES.get(chosen) + if not settings: + return f"Unknown style '{chosen}'. Available: {', '.join(STYLES.keys())}" + + denoise = max(0.0, min(1.0, denoise)) + + async def emit(msg: str, done: bool = False): + if __event_emitter__: + await __event_emitter__({ + "type": "status", + "data": {"description": msg, "done": done}, + }) + + base = self.valves.COMFYUI_BASE_URL.rstrip("/") + + async with aiohttp.ClientSession() as session: + await emit("Looking for attached image…") + raw_in = await _extract_attached_image(__files__, __messages__, session) + if raw_in is None: + return ( + "No image found in the chat. Ask the user to attach the " + "image they want edited (paperclip / drag-drop), or call " + "generate_image instead if they want a new image." + ) + + await emit("Uploading source to ComfyUI…") + uploaded_name = await _upload_to_comfyui(session, base, raw_in) + if not uploaded_name: + return "Failed to upload source image to ComfyUI." + + await emit( + f"Routing to {chosen} ({settings['ckpt']}), denoise {denoise:.2f}" + ) + + positive = f"{settings['prefix']}{edit_instruction}" + negative = settings["negative"] + if negative_prompt: + negative = f"{negative}, {negative_prompt}" + + workflow = _build_img2img( + positive=positive, + negative=negative, + settings=settings, + image_filename=uploaded_name, + denoise=denoise, + seed=seed, + ) + + raw_out, err = await _submit_and_fetch( + session, base, workflow, self.valves.TIMEOUT_SECONDS, emit, settings, + ) + if err: + return err + + b64 = base64.b64encode(raw_out).decode("ascii") + await emit(f"Done — {chosen} (denoise {denoise:.2f})", done=True) + return f"![edit:{chosen}](data:image/png;base64,{b64})"