comfyui-nvidia/deployments/ai-stack/openwebui-tools/smart_image_gen.py

"""
title: Smart Image Generator & Editor (ComfyUI)
author: ai-stack
version: 0.7.10
description: Generate or edit images via ComfyUI with automatic SDXL
    checkpoint routing. Two methods — generate_image (txt2img) and
    edit_image (img2img on the user's most recently attached image). The
    LLM picks (or auto-detects) the right model — photoreal, Pony
    score-tag, NoobAI/Illustrious furry, etc. — and each style ships
    with the creator-recommended sampler, scheduler, CFG, steps, CLIP
    skip, prompt-prefix dialect, and negatives. The image is uploaded
    to Open WebUI's file store and surfaced via a `files` event (the
    canonical pattern used by Open WebUI's own image-gen path); the
    function return is a short confirmation so the LLM doesn't try to
    describe or re-emit the image.
required_open_webui_version: 0.5.0
"""

import asyncio
import base64
import inspect
import io
import json
import re
import time
import uuid
from typing import Awaitable, Callable, Literal, Optional

import aiohttp
from pydantic import BaseModel, Field

# Open WebUI's runtime — only available when the tool is loaded inside the
# Open WebUI process. Guarded so the module still imports for standalone
# linting/testing; if the imports fail at runtime, _push_image_to_chat
# returns False so its caller can fall back to a markdown data-URI message.
try:
    # Open WebUI internals used further down to read chat attachments
    # (Chats, Files) and to publish generated images (Users, UploadFile,
    # upload_file_handler).
    from fastapi import UploadFile
    from open_webui.models.chats import Chats
    from open_webui.models.files import Files
    from open_webui.models.users import Users
    from open_webui.routers.files import upload_file_handler

    # Every import resolved: the Open WebUI-backed code paths may run.
    _OPENWEBUI_RUNTIME = True
except ImportError:
    # The names above stay undefined in this case, so every use of them
    # must be guarded by this flag.
    _OPENWEBUI_RUNTIME = False

# Closed set of style identifiers accepted by the tool's `style` argument.
# Each value is also a key of STYLES; keep the two in sync.
StyleName = Literal[
    "photo", "juggernaut", "pony", "general",
    "furry-nai", "furry-noob", "furry-il",
]


# ─────────────────────────────────────────────────────────────────────────────
# Per-style settings — sampler/scheduler/cfg/steps/clip_skip/prefix/negatives
# come from each model's creator page on Civitai. Three prefix dialects in
# play: photoreal (no prefix, natural language), Pony score chain (REQUIRED
# for any Pony-derived checkpoint), and Booru quality tags (NoobAI /
# Illustrious lineage). Never cross-contaminate.
# ─────────────────────────────────────────────────────────────────────────────

# Style name -> generation settings. Every entry carries the same keys:
#   ckpt      checkpoint filename handed to CheckpointLoaderSimple
#   sampler   KSampler `sampler_name`
#   scheduler KSampler `scheduler`
#   cfg       KSampler `cfg`
#   steps     KSampler `steps`
#   clip_skip positive int; the workflow builders negate it for
#             CLIPSetLastLayer's `stop_at_clip_layer`
#   prefix    quality-tag string for the positive prompt; entries that set
#             one end it with ", " (presumably so it can be prepended
#             directly — the consuming code is not in this part of the file)
#   negative  default negative prompt for the style
STYLES = {
    "photo": {
        "ckpt":      "CyberRealisticXLPlay_V8.0_FP16.safetensors",
        "sampler":   "dpmpp_2m_sde",
        "scheduler": "karras",
        "cfg":       4.0,
        "steps":     28,
        "clip_skip": 1,
        "prefix":    "",  # natural language only — no quality tags
        "negative": (
            "cartoon, drawing, illustration, anime, manga, painting, sketch, "
            "render, 3d, cgi, watercolor, plastic skin, doll-like, oversaturated, "
            "lowres, blurry, jpeg artifacts, noisy, grainy, low quality, worst quality, "
            "bad anatomy, deformed, mutated, extra limbs, extra fingers, missing fingers, "
            "fused fingers, malformed hands, asymmetric face, "
            "watermark, signature, text, logo, label, username"
        ),
    },
    "juggernaut": {
        "ckpt":      "Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors",
        "sampler":   "dpmpp_2m_sde",
        "scheduler": "karras",
        "cfg":       4.5,
        "steps":     35,
        "clip_skip": 1,
        "prefix":    "",  # natural language only
        "negative": (
            "cartoon, drawing, illustration, anime, manga, painting, sketch, "
            "render, 3d, cgi, plastic skin, washed out, oversaturated, "
            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
            "bad anatomy, deformed, mutated, extra limbs, extra fingers, missing fingers, "
            "fused fingers, malformed hands, "
            "watermark, signature, text, logo, username"
        ),
    },
    "pony": {
        "ckpt":      "ponyDiffusionV6XL_v6StartWithThisOne.safetensors",
        "sampler":   "euler_ancestral",
        "scheduler": "normal",
        "cfg":       7.5,
        "steps":     25,
        "clip_skip": 2,
        # REQUIRED — the full chain. Just `score_9` alone is much weaker.
        "prefix":    "score_9, score_8_up, score_7_up, score_6_up, score_5_up, score_4_up, ",
        # Pony's creator notes negatives are usually unnecessary; conservative
        # baseline only. Source-toggle tags (source_pony/furry/anime/cartoon)
        # are intentionally omitted — they exclude entire content domains.
        "negative": (
            "score_6, score_5, score_4, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
            "bad anatomy, bad proportions, bad hands, extra digit, fewer digits, "
            "fused fingers, malformed limbs, deformed, ugly, "
            "censored, monochrome, "
            "watermark, signature, text, logo, artist name, patreon username, twitter username"
        ),
    },
    "general": {
        "ckpt":      "talmendoxlSDXL_v11Beta.safetensors",
        "sampler":   "dpmpp_2m",
        "scheduler": "karras",
        "cfg":       8.0,  # Talmendo wants notably higher CFG than the others
        "steps":     30,
        "clip_skip": 2,
        "prefix":    "",  # creator says don't push "masterpiece" — fights the amateur aesthetic
        "negative": (
            "lowres, blurry, jpeg artifacts, noisy, grainy, low quality, worst quality, "
            "bad anatomy, deformed, mutated, extra limbs, missing fingers, fused fingers, "
            "malformed hands, ugly, "
            "watermark, signature, text, logo"
        ),
    },
    "furry-nai": {
        "ckpt":      "reedFURRYMixSDXL_v23nai.safetensors",
        "sampler":   "euler_ancestral",
        "scheduler": "normal",
        "cfg":       5.0,
        "steps":     30,
        "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, high quality, good quality, "
            "detailed eyes, highres, absurdres, incredibly absurdres, "
        ),
        # NOTE(review): this negative repeats "anatomical nonsense" and ends
        # with a trailing ", " — presumably copied verbatim from the
        # creator page; confirm before normalizing.
        "negative": (
            "worst quality, bad_quality, normal quality, lowres, anatomical nonsense, "
            "bad anatomy, anatomical nonsense, interlocked fingers, extra fingers, "
            "watermark, simple background, transparent, bad_feet, bad_hands, "
            "logo, text, bad_anatomy, signature, face backlighting, "
            "(worst quality, bad quality:1.2), jpeg artifacts, censored, "
            "extra digit, ugly, deformed anatomy, bad proportions, "
        ),
    },
    "furry-noob": {
        "ckpt":      "indigoVoidFurryFusedXL_noobaiV32.safetensors",
        "sampler":   "euler_ancestral",  # creator: other samplers won't work
        "scheduler": "normal",
        "cfg":       4.5,
        "steps":     20,
        "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, perfect quality, absurdres, newest, "
            "very aesthetic, vibrant colors, "
        ),
        "negative": (
            "human, realistic, photorealistic, 3d, cgi, "
            "shiny skin, shiny clothing, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
            "bad anatomy, bad hands, mutated hands, bad proportions, "
            "extra digit, fewer digits, fused fingers, malformed limbs, deformed, ugly, "
            "watermark, signature, text, logo, username, artist signature"
        ),
    },
    "furry-il": {
        "ckpt":      "novaFurryXL_ilV170.safetensors",
        "sampler":   "euler_ancestral",
        "scheduler": "normal",
        "cfg":       4.0,
        "steps":     30,
        "clip_skip": 2,
        # Illustrious wants `newest` in positive and `old`/`oldest` in negative
        # — these are year-bucket tags from the training set. `furry` and
        # `anthro` are universally helpful here.
        "prefix": (
            "masterpiece, best quality, amazing quality, very aesthetic, "
            "high resolution, ultra-detailed, absurdres, newest, furry, anthro, "
        ),
        "negative": (
            "human, multiple tails, modern, recent, old, oldest, "
            "graphic, cartoon, painting, crayon, graphite, abstract, glitch, "
            "deformed, mutated, ugly, disfigured, long body, conjoined, "
            "lowres, bad anatomy, bad hands, missing fingers, extra digits, fewer digits, "
            "cropped, very displeasing, worst quality, bad quality, sketch, "
            "jpeg artifacts, signature, watermark, username, text, simple background, "
            "bad ai-generated"
        ),
    },
}

# Style returned by _route_style when no ROUTING_RULES pattern matches.
DEFAULT_STYLE = "general"

# First-match-wins keyword router used when the caller didn't pick a style.
# Order matters — narrower patterns above broader ones. Each entry is a
# (compiled case-insensitive pattern, STYLES key) pair; _route_style walks
# the list top to bottom and returns the style of the first pattern found
# anywhere in the prompt.
# NOTE(review): the photo rule sits above the anime rule, so a prompt such
# as "anime portrait" routes to "photo" — confirm this precedence is wanted.
ROUTING_RULES = [
    # Pony score chain is the single strongest signal — Pony only
    (re.compile(r"\bscore_\d", re.I),                                       "pony"),
    (re.compile(r"\bpony\b",   re.I),                                       "pony"),
    # NoobAI / Illustrious explicit mentions
    (re.compile(r"\b(noobai|noob)\b", re.I),                                "furry-noob"),
    (re.compile(r"\b(illustrious|ilxl)\b", re.I),                           "furry-il"),
    # Generic furry — defaults to NovaFurry (Illustrious lineage, current sweet spot)
    (re.compile(r"\b(furry|anthro|feral|kemono|fursona|species)\b", re.I),  "furry-il"),
    # Photo / photoreal
    (re.compile(r"\b(juggernaut)\b", re.I),                                 "juggernaut"),
    (re.compile(r"\b(photo|photograph|realistic|portrait|selfie|cinematic)\b", re.I), "photo"),
    # Generic anime / illustration → Pony covers anime well
    (re.compile(r"\b(anime|manga|2d|illustration)\b", re.I),                "pony"),
]


def _route_style(prompt: str) -> str:
    """Choose a style for `prompt` by keyword.

    Returns the style paired with the first ROUTING_RULES pattern that is
    found in the prompt, or DEFAULT_STYLE when none matches.
    """
    # Lazy generator: patterns after the first hit are never evaluated.
    hits = (name for rx, name in ROUTING_RULES if rx.search(prompt))
    return next(hits, DEFAULT_STYLE)


def _inherited_style(messages: Optional[list]) -> Optional[str]:
    """
    Return the `style` argument of the most recent generate_image /
    edit_image tool call in the conversation, or None if there is none.

    Used so edit_image can auto-inherit the style of the image being
    edited when the LLM didn't pass one explicitly — without this, an
    edit on a furry image with a neutral edit prompt ("make the eyes
    glow") falls through to the keyword router and picks a wrong style.

    Messages are scanned newest first; within one message the tool calls
    are scanned in their listed order. A call is only used when its
    `style` is a string that is a key of STYLES. The tool-call payload is
    produced by the LLM, so every level is type-checked: malformed
    entries (non-dict `function`, arguments that are invalid JSON or that
    decode to something other than an object) are skipped rather than
    raising.
    """
    if not messages:
        return None
    for msg in reversed(messages):
        if not isinstance(msg, dict):
            continue
        tool_calls = msg.get("tool_calls")
        if not isinstance(tool_calls, (list, tuple)):
            continue
        for tc in tool_calls:
            if not isinstance(tc, dict):
                continue
            fn = tc.get("function")
            if not isinstance(fn, dict):
                continue
            if fn.get("name") not in ("generate_image", "edit_image"):
                continue
            args = fn.get("arguments")
            if isinstance(args, str):
                # Arguments may arrive as a JSON-encoded string.
                try:
                    args = json.loads(args)
                except (TypeError, ValueError):
                    continue
            if not isinstance(args, dict):
                # e.g. "[1, 2]" or "\"pony\"" decode fine but are not objects.
                continue
            style = args.get("style")
            if isinstance(style, str) and style in STYLES:
                return style
    return None


def _seed_value(seed: int) -> int:
    """Return `seed` unchanged when it is positive, else a clock-derived seed.

    The derived seed is the current epoch time in milliseconds reduced
    modulo 2**31, so it always lies in [0, 2**31).
    """
    if seed > 0:
        return seed
    now_ms = int(time.time() * 1000)
    return now_ms % (2**31)


def _job_prefix(kind: str) -> str:
    """Build a unique SaveImage filename_prefix for one submission.

    The result is `kind`, an underscore, and the first ten hex digits of a
    fresh UUID4, so outputs of concurrent jobs never share an
    auto-numbered counter and cross over.
    """
    token = uuid.uuid4().hex
    return "{}_{}".format(kind, token[:10])


def _build_txt2img(positive: str, negative: str, settings: dict,
                   width: int, height: int, seed: int) -> dict:
    """
    Assemble the ComfyUI API graph for an SDXL text-to-image job.

    The sampler (node "3") runs at denoise 1.0 over an empty
    `width` x `height` latent (batch size 1) using the sampler, scheduler,
    CFG and step count from `settings`. Both prompts are encoded through a
    CLIPSetLastLayer node whose stop layer is `-settings["clip_skip"]`,
    which lets one graph serve skip 1 (-1) and skip 2 (-2). The decoded
    image is written by the SaveImage node "9" under a per-job prefix.

    `seed` is passed through _seed_value before it reaches the sampler.
    Returns the node-id -> node mapping.
    """
    def node(class_type: str, **inputs) -> dict:
        # One graph entry in ComfyUI's {"class_type", "inputs"} shape.
        return {"class_type": class_type, "inputs": inputs}

    graph = {}
    graph["3"] = node(
        "KSampler",
        seed=_seed_value(seed),
        steps=settings["steps"],
        cfg=settings["cfg"],
        sampler_name=settings["sampler"],
        scheduler=settings["scheduler"],
        denoise=1.0,
        model=["4", 0],
        positive=["6", 0],
        negative=["7", 0],
        latent_image=["5", 0],
    )
    graph["4"] = node("CheckpointLoaderSimple", ckpt_name=settings["ckpt"])
    graph["5"] = node("EmptyLatentImage", width=width, height=height, batch_size=1)
    graph["6"] = node("CLIPTextEncode", text=positive, clip=["10", 0])
    graph["7"] = node("CLIPTextEncode", text=negative, clip=["10", 0])
    graph["8"] = node("VAEDecode", samples=["3", 0], vae=["4", 2])
    graph["9"] = node(
        "SaveImage",
        filename_prefix=_job_prefix("smartgen"),
        images=["8", 0],
    )
    graph["10"] = node(
        "CLIPSetLastLayer",
        stop_at_clip_layer=-settings["clip_skip"],
        clip=["4", 1],
    )
    return graph


def _build_inpaint(positive: str, negative: str, settings: dict,
                   image_filename: str, mask_text: str,
                   denoise: float, seed: int) -> dict:
    """
    Assemble the ComfyUI API graph for text-masked SDXL inpainting.

    `image_filename` is loaded and VAE-encoded; a GroundingDinoSAMSegment
    node (from the segment-anything custom nodes) turns `mask_text` — a
    noun phrase such as "the dog's collar" — into a mask at threshold 0.3.
    That mask is attached to the latent with SetLatentNoiseMask, so the
    sampler, running at `denoise`, repaints only the masked region.

    Before it is applied, the mask goes through GrowMask (expand 12,
    tapered corners). The intent is to avoid a hard, pixel-exact boundary
    that leaves the repainted area looking pasted-on with visible seams.

    The loader nodes name the "sam_hq_vit_h (2.57GB)" and
    "GroundingDINO_SwinT_OGC (694MB)" models; presumably the custom nodes
    fetch those weights on first use — that happens outside this file.

    `seed` is passed through _seed_value. The result image is written by
    SaveImage node "9" under a per-job prefix. Returns the node-id -> node
    mapping.
    """
    def node(class_type: str, **inputs) -> dict:
        # One graph entry in ComfyUI's {"class_type", "inputs"} shape.
        return {"class_type": class_type, "inputs": inputs}

    graph = {}
    graph["3"] = node(
        "KSampler",
        seed=_seed_value(seed),
        steps=settings["steps"],
        cfg=settings["cfg"],
        sampler_name=settings["sampler"],
        scheduler=settings["scheduler"],
        denoise=denoise,
        model=["4", 0],
        positive=["6", 0],
        negative=["7", 0],
        latent_image=["13", 0],
    )
    graph["4"] = node("CheckpointLoaderSimple", ckpt_name=settings["ckpt"])
    graph["6"] = node("CLIPTextEncode", text=positive, clip=["10", 0])
    graph["7"] = node("CLIPTextEncode", text=negative, clip=["10", 0])
    graph["8"] = node("VAEDecode", samples=["3", 0], vae=["4", 2])
    graph["9"] = node(
        "SaveImage",
        filename_prefix=_job_prefix("smartinpaint"),
        images=["8", 0],
    )
    graph["10"] = node(
        "CLIPSetLastLayer",
        stop_at_clip_layer=-settings["clip_skip"],
        clip=["4", 1],
    )
    graph["11"] = node("VAEEncode", pixels=["12", 0], vae=["4", 2])
    graph["12"] = node("LoadImage", image=image_filename)
    # The grown mask (node "17") limits where the sampler may add noise.
    graph["13"] = node("SetLatentNoiseMask", samples=["11", 0], mask=["17", 0])
    graph["14"] = node(
        "SAMModelLoader (segment anything)",
        model_name="sam_hq_vit_h (2.57GB)",
    )
    graph["15"] = node(
        "GroundingDinoModelLoader (segment anything)",
        model_name="GroundingDINO_SwinT_OGC (694MB)",
    )
    graph["16"] = node(
        "GroundingDinoSAMSegment (segment anything)",
        sam_model=["14", 0],
        grounding_dino_model=["15", 0],
        image=["12", 0],
        prompt=mask_text,
        threshold=0.3,
    )
    # Output slot 1 of the segment node is used as the mask input here.
    graph["17"] = node(
        "GrowMask",
        mask=["16", 1],
        expand=12,
        tapered_corners=True,
    )
    return graph


def _build_img2img(positive: str, negative: str, settings: dict,
                   image_filename: str, denoise: float, seed: int) -> dict:
    """
    Assemble the ComfyUI API graph for an SDXL image-to-image job.

    `image_filename` (a name LoadImage can resolve, i.e. already uploaded
    to ComfyUI) is VAE-encoded and used as the sampler's starting latent
    at the requested `denoise`. The graph contains no resize node, so the
    output keeps the source image's resolution.

    `seed` is passed through _seed_value. The result image is written by
    SaveImage node "9" under a per-job prefix. Returns the node-id -> node
    mapping.
    """
    def node(class_type: str, **inputs) -> dict:
        # One graph entry in ComfyUI's {"class_type", "inputs"} shape.
        return {"class_type": class_type, "inputs": inputs}

    graph = {}
    graph["3"] = node(
        "KSampler",
        seed=_seed_value(seed),
        steps=settings["steps"],
        cfg=settings["cfg"],
        sampler_name=settings["sampler"],
        scheduler=settings["scheduler"],
        denoise=denoise,
        model=["4", 0],
        positive=["6", 0],
        negative=["7", 0],
        latent_image=["11", 0],
    )
    graph["4"] = node("CheckpointLoaderSimple", ckpt_name=settings["ckpt"])
    graph["6"] = node("CLIPTextEncode", text=positive, clip=["10", 0])
    graph["7"] = node("CLIPTextEncode", text=negative, clip=["10", 0])
    graph["8"] = node("VAEDecode", samples=["3", 0], vae=["4", 2])
    graph["9"] = node(
        "SaveImage",
        filename_prefix=_job_prefix("smartedit"),
        images=["8", 0],
    )
    graph["10"] = node(
        "CLIPSetLastLayer",
        stop_at_clip_layer=-settings["clip_skip"],
        clip=["4", 1],
    )
    graph["11"] = node("VAEEncode", pixels=["12", 0], vae=["4", 2])
    graph["12"] = node("LoadImage", image=image_filename)
    return graph


def _file_dict_is_image(f: dict) -> bool:
    """Tell whether a file dict describes an image.

    True when the lower-cased `type` contains "image", or when the
    lower-cased `name` (falling back to `filename`) ends in .png, .jpg,
    .jpeg or .webp. Missing or empty fields count as empty strings.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".webp")
    declared_type = (f.get("type") or "").lower()
    label = (f.get("name") or f.get("filename") or "").lower()
    if "image" in declared_type:
        return True
    return label.endswith(image_suffixes)


# Captures the file id from a URL containing "/files/<id>", e.g.
# "/api/v1/files/<id>/content"; the "api/v1/" prefix and the "/content"
# suffix are both optional. The id must be at least 8 characters drawn
# from hex digits and hyphens.
_FILE_URL_ID_RE = re.compile(r"/(?:api/v1/)?files/([0-9a-fA-F-]{8,})(?:/content)?")


async def _read_file_dict(f: dict) -> Optional[bytes]:
    """
    Try to read the raw bytes behind one file dict; return None on failure.

    Sources are tried in this order:
      1. Local filesystem path keys on the dict itself (`path`,
         `filepath`, `file_path`).
      2. Open WebUI's Files.get_file_by_id with f["id"].
      3. The same lookup with the id parsed out of f["url"] (covers
         attachments shaped like
         {"type":"image","url":"/api/v1/files/<uuid>/content"} that carry
         neither an id nor a path).
    For 2 and 3 the file is read from the model's `path`, or from
    `meta["path"]` / `meta.path` when the model has no path of its own.
    Those lookups only run when the Open WebUI runtime imported.

    Files.get_file_by_id is called and then awaited only if it returned an
    awaitable, so both synchronous and coroutine-based accessors work —
    the same tolerance _push_image_to_chat applies to upload_file_handler.
    Awaiting unconditionally would raise TypeError on a synchronous
    accessor and silently disable the lookup.

    Only string paths are opened: an integer would make open() adopt (and
    later close) an arbitrary file descriptor.

    NOTE(review): path keys come straight from the file dict, which may
    originate from client-supplied chat payloads — consider restricting
    reads to the upload directory.
    """
    def read_bytes(path) -> Optional[bytes]:
        # Best-effort read; unreadable or non-string paths yield None.
        if not path or not isinstance(path, str):
            return None
        try:
            with open(path, "rb") as fh:
                return fh.read()
        except (OSError, ValueError):
            return None

    # 1. Paths carried directly on the dict.
    for path_key in ("path", "filepath", "file_path"):
        data = read_bytes(f.get(path_key))
        if data is not None:
            return data

    # 2./3. Ids to look up: the explicit id first, then the one in the URL.
    candidate_ids = []
    if f.get("id"):
        candidate_ids.append(f["id"])
    url = f.get("url")
    if isinstance(url, str):
        m = _FILE_URL_ID_RE.search(url)
        if m and m.group(1) not in candidate_ids:
            candidate_ids.append(m.group(1))

    if not (candidate_ids and _OPENWEBUI_RUNTIME):
        return None

    for fid in candidate_ids:
        try:
            file_model = Files.get_file_by_id(fid)
            if inspect.isawaitable(file_model):
                file_model = await file_model
        except Exception:
            # Deliberate best-effort: a failed lookup for one id must not
            # stop the remaining candidates from being tried.
            continue
        if file_model is None:
            continue
        path = getattr(file_model, "path", None)
        if not path:
            meta = getattr(file_model, "meta", None) or {}
            if isinstance(meta, dict):
                path = meta.get("path")
            else:
                path = getattr(meta, "path", None)
        data = read_bytes(path)
        if data is not None:
            return data

    return None


def _image_file_dicts(candidates) -> list:
    """Return the dict entries of `candidates` that look like images."""
    if not isinstance(candidates, (list, tuple)):
        return []
    return [f for f in candidates if isinstance(f, dict) and _file_dict_is_image(f)]


def _decode_inline_image(messages: Optional[list]) -> Optional[bytes]:
    """Decode the newest inline base64 image data URI found in `messages`."""
    for msg in reversed(messages or []):
        content = msg.get("content") if isinstance(msg, dict) else None
        if not isinstance(content, list):
            continue
        for block in content:
            if not isinstance(block, dict) or block.get("type") != "image_url":
                continue
            image_url = block.get("image_url")
            # Accept both {"image_url": {"url": ...}} and a bare string.
            url = image_url.get("url") if isinstance(image_url, dict) else image_url
            if not (isinstance(url, str) and url.startswith("data:image")):
                continue
            try:
                return base64.b64decode(url.split(",", 1)[1])
            except (IndexError, ValueError):
                # No comma separator, or payload that isn't valid base64.
                continue
    return None


async def _first_readable_image(candidates) -> Optional[bytes]:
    """Return bytes of the first image dict in `candidates` that can be read."""
    for f in _image_file_dicts(candidates):
        data = await _read_file_dict(f)
        if data is not None:
            return data
    return None


async def _image_from_chat_record(metadata: Optional[dict]) -> Optional[bytes]:
    """Look for the newest readable image on the chat stored in the database."""
    if not (isinstance(metadata, dict) and _OPENWEBUI_RUNTIME):
        return None
    chat_id = metadata.get("chat_id")
    if not chat_id:
        return None
    try:
        chat = Chats.get_chat_by_id(chat_id)
        # Accessor may be sync or a coroutine depending on the version.
        if inspect.isawaitable(chat):
            chat = await chat
        chat_data = getattr(chat, "chat", None) if chat else None
        stored = chat_data.get("messages") if isinstance(chat_data, dict) else None
        for msg in reversed(stored if isinstance(stored, list) else []):
            if not isinstance(msg, dict):
                continue
            data = await _first_readable_image(msg.get("files"))
            if data is not None:
                return data
    except Exception:
        # Deliberate best-effort: a database problem must not prevent the
        # remaining fallbacks from running.
        pass
    return None


async def _extract_attached_image(
    files: Optional[list],
    messages: Optional[list],
    metadata: Optional[dict],
    session: aiohttp.ClientSession,
) -> Optional[bytes]:
    """
    Find the most recent image in the chat — including images previously
    emitted by this tool itself — and return its bytes, or None.

    Search order:

      1. Inline base64 data URIs in `image_url` content blocks of
         `messages`, newest message first.
      2. Files attached to `messages`, newest message first. This covers
         files the user attached and files attached to assistant messages
         by the `files` event in _push_image_to_chat.
      3. The `files` argument (the __files__ tool param), for versions
         that pass user uploads there instead of on the message.
      4. The chat record loaded from the database via
         metadata["chat_id"], in case `messages` did not carry the
         attachments. Only runs inside the Open WebUI runtime.
      5. Unauthenticated HTTP GET of each image dict's `url` using
         `session`; relative URLs are resolved against
         http://localhost:8080. Last resort — likely fails on
         auth-protected endpoints.

    Malformed entries (non-dict messages, non-list file collections,
    non-string URLs) are skipped rather than raising.

    NOTE(review): step 5 fetches URLs taken from message data and
    hard-codes the local base URL; consider restricting it to relative
    URLs and making the base configurable.
    """
    # 1. Inline data URIs on recent messages.
    data = _decode_inline_image(messages)
    if data is not None:
        return data

    history = [m for m in reversed(messages or []) if isinstance(m, dict)]

    # 2. Files on messages, newest first.
    for msg in history:
        data = await _first_readable_image(msg.get("files"))
        if data is not None:
            return data

    # 3. __files__ param (current user upload, sometimes only here).
    data = await _first_readable_image(files)
    if data is not None:
        return data

    # 4. Chat record from the database.
    data = await _image_from_chat_record(metadata)
    if data is not None:
        return data

    # 5. Last-resort URL fetch (no auth — only works for public endpoints).
    for source in [files] + [msg.get("files") for msg in history]:
        for f in _image_file_dicts(source):
            url = f.get("url")
            if not (isinstance(url, str) and url):
                continue
            full = url if url.startswith("http") else f"http://localhost:8080{url}"
            try:
                async with session.get(full) as resp:
                    if resp.status == 200:
                        return await resp.read()
            except (aiohttp.ClientError, asyncio.TimeoutError):
                # Try the next candidate URL.
                pass

    return None


async def _upload_to_comfyui(
    session: aiohttp.ClientSession, base: str, raw: bytes
) -> Optional[str]:
    """Upload `raw` to ComfyUI's /upload/image endpoint.

    The bytes are posted as multipart field "image" (declared as
    image/png) under a random smartedit_<12 hex digits>.png filename, with
    the "overwrite" field set to "true". Returns the "name" from the JSON
    reply — or the generated filename when the reply has none — and None
    for any non-200 status.
    """
    upload_name = "smartedit_{}.png".format(uuid.uuid4().hex[:12])

    payload = aiohttp.FormData()
    payload.add_field(
        "image", raw, filename=upload_name, content_type="image/png"
    )
    payload.add_field("overwrite", "true")

    endpoint = f"{base}/upload/image"
    async with session.post(endpoint, data=payload) as reply:
        if reply.status == 200:
            body = await reply.json()
            return body.get("name", upload_name)
        return None


async def _push_image_to_chat(
    raw: bytes,
    filename_prefix: str,
    request,
    user_dict: Optional[dict],
    metadata: Optional[dict],
    event_emitter: Optional[Callable[[dict], Awaitable[None]]],
) -> bool:
    """
    Surface a generated image in the chat: upload the bytes through Open
    WebUI's internal file store, then emit a `files` event that references
    the served URL (the file's author notes this mirrors Open WebUI's own
    image-generation path).

    Args:
        raw: PNG bytes to publish.
        filename_prefix: prefix of the stored filename; a random 8-hex
            suffix and ".png" are appended.
        request: the incoming request object; its `app` resolves the
            "get_file_content_by_id" route to build the URL.
        user_dict: the calling user's dict; only its "id" is read.
        metadata: optional dict whose "chat_id" / "message_id" are
            forwarded to the upload handler.
        event_emitter: coroutine function that receives the `files` event.

    Returns:
        True if the image was uploaded and the event emitted. False if a
        prerequisite is missing (no runtime, request, user or emitter), the
        user cannot be found, or any step raises — the caller should then
        fall back to a data-URI markdown message.

    Both Users.get_user_by_id and upload_file_handler may be synchronous
    or coroutine-based depending on the Open WebUI version, so each result
    is awaited only when it is awaitable. Awaiting the user lookup
    unconditionally would raise TypeError on a synchronous accessor and
    force the fallback every time.
    """
    if not (request and user_dict and event_emitter and _OPENWEBUI_RUNTIME):
        return False

    try:
        user = Users.get_user_by_id(user_dict.get("id"))
        if inspect.isawaitable(user):
            user = await user
        if not user:
            return False

        upload = UploadFile(
            file=io.BytesIO(raw),
            filename=f"{filename_prefix}_{uuid.uuid4().hex[:8]}.png",
            headers={"content-type": "image/png"},
        )
        meta = metadata or {}
        file_item = upload_file_handler(
            request=request,
            file=upload,
            metadata={
                "chat_id":    meta.get("chat_id"),
                "message_id": meta.get("message_id"),
            },
            process=False,
            user=user,
        )
        if inspect.isawaitable(file_item):
            file_item = await file_item

        url = request.app.url_path_for(
            "get_file_content_by_id", id=file_item.id
        )

        await event_emitter({
            "type": "files",
            "data": {"files": [{"type": "image", "url": url}]},
        })
        return True
    except Exception:
        # Any failure (signature drift, missing route, etc.) falls back
        # to the data-URI path in the caller.
        return False


def _history_failure(entry: dict) -> Optional[str]:
    """Return an error message if a history entry shows the job is over
    without success, else None (status unknown or not terminal)."""
    status = entry.get("status")
    if not isinstance(status, dict):
        # No status block to inspect — caller keeps polling.
        return None
    if status.get("status_str") == "error":
        detail = ""
        for message in status.get("messages") or []:
            # Messages are expected as [event_name, payload] pairs; only
            # the execution_error payload carries the exception text.
            if (
                isinstance(message, (list, tuple))
                and len(message) == 2
                and message[0] == "execution_error"
                and isinstance(message[1], dict)
            ):
                detail = str(message[1].get("exception_message") or "").strip()
        if detail:
            return f"ComfyUI reported an execution error: {detail}"
        return "ComfyUI reported an execution error."
    if status.get("completed"):
        return "ComfyUI finished the job without producing an image."
    return None


async def _submit_and_fetch(
    session: aiohttp.ClientSession,
    base: str,
    workflow: dict,
    timeout_seconds: int,
    emit: Callable[[str, bool], Awaitable[None]],
    settings: dict,
) -> tuple[Optional[bytes], Optional[str]]:
    """Submit a workflow, poll history, fetch the first output image.

    Args:
        session: HTTP session used for every ComfyUI request.
        base: ComfyUI base URL (no trailing slash).
        workflow: API-format graph posted to /prompt.
        timeout_seconds: maximum time to poll /history/<prompt_id>.
        emit: status callback, called once after submission with a
            sampling summary and False.
        settings: style settings; only used for the status text.

    Returns:
        (image_bytes, None) on success, otherwise (None, error_message).
        Errors cover a rejected prompt, a missing prompt_id, a job that
        history reports as failed or as completed without images, a
        timeout, and a failed /view download.

    A job whose history entry already reports failure (or completion with
    no images) is returned as an error right away instead of being polled
    until the timeout expires.
    """
    client_id = str(uuid.uuid4())

    async with session.post(
        f"{base}/prompt", json={"prompt": workflow, "client_id": client_id}
    ) as resp:
        if resp.status != 200:
            return None, f"ComfyUI rejected the prompt: {resp.status} {await resp.text()}"
        prompt_id = (await resp.json()).get("prompt_id")
        if not prompt_id:
            return None, "ComfyUI didn't return a prompt_id."

    await emit(
        f"Sampling — {settings['sampler']}/{settings['scheduler']}, "
        f"CFG {settings['cfg']}, {settings['steps']} steps", False
    )

    # The SaveImage node in every workflow this tool builds is id "9".
    # It is preferred explicitly because intermediate nodes (e.g. the
    # segment node's IMAGE output in the inpaint workflow) can land in the
    # outputs dict too, and taking whichever came first sometimes returned
    # an overlay or masked-only image.
    SAVE_NODE_ID = "9"

    # Monotonic clock: the deadline must not move if the wall clock does.
    deadline = time.monotonic() + timeout_seconds
    output_images: list = []
    while time.monotonic() < deadline:
        await asyncio.sleep(1.5)
        async with session.get(f"{base}/history/{prompt_id}") as resp:
            if resp.status != 200:
                continue
            history = await resp.json()
        entry = history.get(prompt_id) if isinstance(history, dict) else None
        if not isinstance(entry, dict):
            # Not in history yet — still queued or running.
            continue
        outputs = entry.get("outputs") or {}
        # Prefer the canonical SaveImage output …
        output_images = list((outputs.get(SAVE_NODE_ID) or {}).get("images") or [])
        # … only fall back to other nodes if SaveImage didn't fire
        # (workflow drift, manual override, etc.)
        if not output_images:
            for node_out in outputs.values():
                if isinstance(node_out, dict):
                    output_images.extend(node_out.get("images") or [])
        if output_images:
            break
        failure = _history_failure(entry)
        if failure:
            return None, failure

    if not output_images:
        return None, f"Timed out after {timeout_seconds}s waiting for image."

    img = output_images[0]
    if not isinstance(img, dict) or not img.get("filename"):
        return None, "ComfyUI returned an image entry without a filename."
    params = {
        "filename": img["filename"],
        "subfolder": img.get("subfolder", ""),
        "type": img.get("type", "output"),
    }
    async with session.get(f"{base}/view", params=params) as resp:
        if resp.status != 200:
            return None, f"Failed to fetch image: {resp.status}"
        return await resp.read(), None


class Tools:
    class Valves(BaseModel):
        # Settings model for this tool. Described with `#` comments rather
        # than a class docstring so the pydantic model itself is unchanged.

        # Root of the ComfyUI HTTP API. The tool methods strip a trailing
        # slash from it before appending request paths.
        COMFYUI_BASE_URL: str = Field(
            "http://comfyui:8188",
            description=(
                "ComfyUI server URL reachable from the "
                "open-webui container."
            ),
        )

        # Upper bound, in seconds, handed to the submit-and-poll helper for
        # one generation; it is also echoed in the time-out error message.
        TIMEOUT_SECONDS: int = Field(
            600,
            description=(
                "Maximum wait for a single generation to complete. Default 10 "
                "minutes — long enough to absorb a first-time inpaint where "
                "SAM-HQ + GroundingDINO + BERT auto-download (~3 GB). "
                "Steady-state runs finish in well under a minute; if your "
                "KSampler routinely takes longer than that, lower the "
                "per-style steps in STYLES."
            ),
        )

    def __init__(self):
        """Instantiate the valves model with its declared defaults."""
        valves_model = self.Valves
        self.valves = valves_model()

    async def generate_image(
        self,
        prompt: str,
        style: Optional[StyleName] = None,
        negative_prompt: Optional[str] = None,
        width: int = 1024,
        height: int = 1024,
        seed: int = 0,
        __request__=None,
        __user__: Optional[dict] = None,
        __metadata__: Optional[dict] = None,
        __event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None,
    ) -> str:
        """
        Create a NEW image from scratch and show it to the user. Use this
        whenever the user asks you to draw, generate, create, make, paint,
        render, or imagine any visual content — photographs, portraits,
        characters, scenes, illustrations, anime, drawings — and they have
        NOT attached an existing image. If they did attach an image and
        want it modified, use edit_image instead.

        Pick `style` to match what the user wants:
        - "photo" — photorealistic photographs, portraits, cinematic shots.
        - "juggernaut" — alternate photoreal style (sharper, more saturated).
        - "pony" — anime / illustration / cartoon (Pony Diffusion).
        - "general" — fallback for anything that doesn't fit the others.
        - "furry-nai" — anthropomorphic characters (NAI-trained mix).
        - "furry-noob" — anthropomorphic characters (NoobAI base).
        - "furry-il" — anthropomorphic characters (Illustrious base, default
          for any "furry" / "anthro" request unless specified otherwise).

        Each style auto-prepends the right quality tags and picks the right
        sampler / CFG / steps / CLIP skip. Do NOT add tags like
        "masterpiece" or "score_9" to `prompt` yourself; the tool handles
        that.

        :param prompt: Plain description of the image (subject, scene,
            style notes, lighting, etc.). No quality tags.
        :param style: One of the values above. Omit to auto-detect.
        :param negative_prompt: Extra terms to exclude. Usually unneeded.
        :param width: Pixels (default 1024 — SDXL native). For portraits
            use 832 with height 1216; for landscapes 1216 with height 832.
        :param height: Pixels (default 1024).
        :param seed: 0 to randomize, otherwise a specific seed for repeats.
        :return: Short confirmation text (the image itself is delivered to
            the chat through events), or an error message on failure.
        """
        # Explicit style wins; otherwise fall back to keyword routing.
        chosen = style or _route_style(prompt)
        settings = STYLES.get(chosen)
        if not settings:
            # No status has been emitted yet, so a plain return is enough.
            return f"Unknown style '{chosen}'. Available: {', '.join(STYLES.keys())}"

        async def emit(msg: str, done: bool = False):
            if __event_emitter__:
                await __event_emitter__({
                    "type": "status",
                    "data": {"description": msg, "done": done},
                })

        async def fail(reason: str) -> str:
            # Every status sent before this point carries done=False; send
            # a closing one so the chat is not left on an in-progress step,
            # then hand the reason back to the LLM as the tool result.
            await emit(f"Failed — {reason}", done=True)
            return reason

        await emit(f"Routing to {chosen} ({settings['ckpt']})")

        # Style prefix goes first; caller-supplied negatives are appended
        # to the style's own negative prompt.
        positive = f"{settings['prefix']}{prompt}"
        negative = settings["negative"]
        if negative_prompt:
            negative = f"{negative}, {negative_prompt}"

        workflow = _build_txt2img(positive, negative, settings, width, height, seed)
        base = self.valves.COMFYUI_BASE_URL.rstrip("/")

        try:
            async with aiohttp.ClientSession() as session:
                raw, err = await _submit_and_fetch(
                    session, base, workflow, self.valves.TIMEOUT_SECONDS, emit, settings,
                )
        except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
            # Transport-level failure (server unreachable, connection
            # dropped, request timed out): report it the same way as the
            # helper's own error strings instead of raising.
            return await fail(
                f"Could not reach ComfyUI at {base}: {type(exc).__name__}: {exc}"
            )
        if err:
            return await fail(err)

        # Surface the image in the chat. Preferred path uploads to Open
        # WebUI's file store and emits a `files` event (matches the built-
        # in image-gen flow). Fallback inlines a data-URI markdown via a
        # `message` event for environments where the file API isn't
        # reachable from the tool process.
        pushed = await _push_image_to_chat(
            raw, "smartgen", __request__, __user__, __metadata__, __event_emitter__,
        )
        if not pushed and __event_emitter__:
            b64 = base64.b64encode(raw).decode("ascii")
            await __event_emitter__({
                "type": "message",
                "data": {"content": f"![{chosen}](data:image/png;base64,{b64})"},
            })

        await emit(f"Done — {chosen}", done=True)
        return (
            f"Image generated and shown to the user above (style: {chosen}, "
            f"checkpoint: {settings['ckpt']}). Do NOT describe the image, "
            f"do NOT repeat any base64 or markdown — the user can see it. "
            f"You may briefly note your style choice and offer one or two "
            f"iteration ideas (different style, tighter framing, etc)."
        )

    async def edit_image(
        self,
        prompt: str,
        style: Optional[StyleName] = None,
        mask_text: Optional[str] = None,
        denoise: Optional[float] = None,
        negative_prompt: Optional[str] = None,
        seed: int = 0,
        __request__=None,
        __user__: Optional[dict] = None,
        __metadata__: Optional[dict] = None,
        __files__: Optional[list] = None,
        __messages__: Optional[list] = None,
        __event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None,
    ) -> str:
        """
        Edit, modify, transform, or restyle an image the user has ATTACHED
        to the chat. Use whenever the user uploads an image and asks to
        change it. If no image is attached, use generate_image instead.

        TWO MODES — choose based on whether the change is local or global:

        - LOCAL change ("change the ball to a basketball", "make the dog
          wear a hat", "remove the bird") → set `mask_text` to a brief
          noun phrase describing the region ("the ball", "the dog", "the
          bird"). The tool uses GroundingDINO+SAM to find that region
          automatically and only that area is repainted; the rest of the
          image stays pixel-perfect.

        - GLOBAL change ("make this a sunset", "turn this into anime",
          "restyle this as oil painting") → leave `mask_text` unset. The
          whole image is reimagined via img2img.

        Always prefer LOCAL mode when the user names a specific object,
        person, or region. GLOBAL mode is for whole-image style/lighting
        transformations.

        Denoise tuning:
        - LOCAL (mask_text set): default 1.0 — full repaint within mask.
          Drop to 0.6–0.8 for subtle local edits that should retain some
          original structure.
        - GLOBAL (no mask_text): default 0.7 — moderate edit. Use 0.3–0.5
          for subtle restyling, 0.85–1.0 for radical reimagining.

        Pick `style` for the DESIRED OUTPUT, not the input image.

        Style resolution order: inherited from the most recent prior
        generate_image / edit_image call in this conversation (DOMINANT)
        → explicit `style` arg → keyword detection on `prompt`.
        Inheritance dominates because vision LLMs misclassify subjects
        in the rendered output (e.g. picking 'juggernaut' on a
        'furry-il' source). For follow-up edits on an image you
        generated earlier, omit `style` entirely — the tool reuses the
        established style automatically. The user can start a new chat
        if they want a different style.

        :param prompt: What the changed area should look like.
            Tool auto-prepends quality tags — don't include those.
        :param style: One of the StyleName values. Omit to auto-inherit
            from the previous tool call (recommended for edits on
            images you generated earlier in this chat).
        :param mask_text: Noun phrase describing the region to edit. Set
            for LOCAL changes; omit for GLOBAL.
        :param denoise: 0.0 = no change, 1.0 = ignore source. Defaults to
            1.0 with mask_text, 0.7 without.
        :param negative_prompt: Extra terms to exclude. Usually unneeded.
        :param seed: 0 to randomize, otherwise specific.
        :return: Short confirmation text (the edited image is delivered to
            the chat through events), or an error message — e.g. when no
            image is attached.
        """
        # Resolve style — inheritance DOMINATES for edits. Vision LLMs
        # misclassify subject types (observed in the wild: juggernaut
        # picked for a furry-il source because the model thought the
        # rendered character looked "photoreal-ish"). When there's a
        # prior tool call in this chat, use the same style; the user's
        # workaround for genuine style changes is a fresh chat.
        chosen = _inherited_style(__messages__) or style or _route_style(prompt)
        settings = STYLES.get(chosen)
        if not settings:
            # No status has been emitted yet, so a plain return is enough.
            return f"Unknown style '{chosen}'. Available: {', '.join(STYLES.keys())}"

        # Denoise default depends on mode: 1.0 (full repaint within mask)
        # for inpainting, 0.7 for img2img. Explicit values are clamped to
        # the [0.0, 1.0] range.
        if denoise is None:
            denoise = 1.0 if mask_text else 0.7
        denoise = max(0.0, min(1.0, denoise))

        async def emit(msg: str, done: bool = False):
            if __event_emitter__:
                await __event_emitter__({
                    "type": "status",
                    "data": {"description": msg, "done": done},
                })

        async def fail(reason: str, status: Optional[str] = None) -> str:
            # Every status sent before this point carries done=False; send
            # a closing one so the chat is not left on an in-progress step,
            # then hand the reason back to the LLM as the tool result.
            # `status` lets a long reason be shown as a shorter status line.
            await emit(status or f"Failed — {reason}", done=True)
            return reason

        base = self.valves.COMFYUI_BASE_URL.rstrip("/")

        try:
            async with aiohttp.ClientSession() as session:
                await emit("Looking for attached image…")
                raw_in = await _extract_attached_image(
                    __files__, __messages__, __metadata__, session,
                )
                if raw_in is None:
                    # Collect a few counters so the returned text explains
                    # where the lookup came up empty.
                    msgs_with_files = sum(
                        1 for m in (__messages__ or [])
                        if isinstance(m, dict) and m.get("files")
                    )
                    chat_id_present = bool((__metadata__ or {}).get("chat_id"))
                    return await fail(
                        "No image found in the chat. Diagnostics: "
                        f"__files__={len(__files__ or [])}, "
                        f"__messages__={len(__messages__ or [])} "
                        f"(of which {msgs_with_files} had a files field), "
                        f"chat_id_present={chat_id_present}, "
                        f"openwebui_runtime={_OPENWEBUI_RUNTIME}. "
                        "Ask the user to attach the image they want edited "
                        "(paperclip / drag-drop), or call generate_image instead.",
                        status="No attached image found",
                    )

                # Diagnostic emit so a misrouted source ("wrong image
                # returned") shows up in the status track instead of being
                # invisible. SHA-1 is fast and the first 8 hex chars are
                # plenty to compare against the prior generation's hash if
                # cross-talk is suspected.
                import hashlib  # local import — keeps the module import surface clean
                src_hash = hashlib.sha1(raw_in).hexdigest()[:8]
                await emit(f"Uploading source to ComfyUI… (sha1={src_hash}, {len(raw_in)} bytes)")
                uploaded_name = await _upload_to_comfyui(session, base, raw_in)
                if not uploaded_name:
                    return await fail("Failed to upload source image to ComfyUI.")

                mode = "inpaint" if mask_text else "img2img"
                await emit(
                    f"Routing to {chosen} ({settings['ckpt']}), {mode}, denoise {denoise:.2f}"
                    + (f", mask='{mask_text}'" if mask_text else "")
                )

                # Style prefix goes first; caller-supplied negatives are
                # appended to the style's own negative prompt.
                positive = f"{settings['prefix']}{prompt}"
                negative = settings["negative"]
                if negative_prompt:
                    negative = f"{negative}, {negative_prompt}"

                # mask_text selects the masked inpaint workflow; without it
                # the whole image goes through img2img.
                if mask_text:
                    workflow = _build_inpaint(
                        positive=positive,
                        negative=negative,
                        settings=settings,
                        image_filename=uploaded_name,
                        mask_text=mask_text,
                        denoise=denoise,
                        seed=seed,
                    )
                else:
                    workflow = _build_img2img(
                        positive=positive,
                        negative=negative,
                        settings=settings,
                        image_filename=uploaded_name,
                        denoise=denoise,
                        seed=seed,
                    )

                raw_out, err = await _submit_and_fetch(
                    session, base, workflow, self.valves.TIMEOUT_SECONDS, emit, settings,
                )
        except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
            # Transport-level failure (server unreachable, connection
            # dropped, request timed out): report it the same way as the
            # helpers' own error strings instead of raising.
            return await fail(
                f"Could not reach ComfyUI at {base}: {type(exc).__name__}: {exc}"
            )
        if err:
            return await fail(err)

        # Preferred path pushes the result through Open WebUI's file
        # store; if that did not happen, inline it as a data-URI markdown
        # image via a `message` event.
        pushed = await _push_image_to_chat(
            raw_out, "smartedit", __request__, __user__, __metadata__, __event_emitter__,
        )
        if not pushed and __event_emitter__:
            b64 = base64.b64encode(raw_out).decode("ascii")
            await __event_emitter__({
                "type": "message",
                "data": {"content": f"![edit:{chosen}](data:image/png;base64,{b64})"},
            })

        await emit(f"Done — {chosen} (denoise {denoise:.2f})", done=True)
        return (
            f"Edited image shown to the user above (style: {chosen}, "
            f"checkpoint: {settings['ckpt']}, denoise: {denoise:.2f}). Do NOT "
            f"describe the image, do NOT repeat any base64 or markdown — the "
            f"user can see it. You may briefly note your choice and offer "
            f"iterations (different denoise, alternate style, etc)."
        )